In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

sns.set()  # make plots nicer

np.random.seed(42)  # set seed for reproducibility

In [2]:
def file_parser_with_prev_next(path):
    file = open(path, 'r')
    lines = file.readlines()
    
    file_name = [path.split('/')[-1]]
    sentence = ""
    file_data = []
    
    has_value = False
    previous = 0
    
    for line in lines:
        line = line.strip()
        
        # if there are only 2 informations on line and second is h#, then ignore
        # strip line, split primarly on ; secondary on ,
        if (line.startswith('#')):
            if (not sentence):
                sentence = line[len('# Sentence: "'): len(line) - 1]
            continue
        
        line = line.split(';')
        
        if (len(line) == 1):
            #lines containing only their packet size and nothing else, they should be added
            #TODO
            line += [""]
            line += [""]
            #continue
        
        if (len(line) == 2):
            #this tries to remove most of the silence at the start of the recording
            #potentionally harmfull as we shouldn't clean test data this way (we will be reading labels)
            #if (line[1] == 'h#'):
            #    continue
            line += [""]
        
        line[1] = tuple(line[1].split(','))
        line[2] = tuple(list(map(lambda a: a.strip('"'), line[2].split(','))))
        
        if (has_value):
            file_data[-1][-4] = line[0]
           
        # file_type and sentence contain duplicate informations, but are kept for readability
        split_filename = file_name[0].split('-')
        
        line = file_name + [split_filename[0]] + [split_filename[1]] + [split_filename[2][0:-4]] + [sentence] + [previous] + [0] + line
        #adding previous as feature
        previous = line[-3]
        file_data += [line]
        
        #adding next frame as feature
        has_value = True
        
    return pd.DataFrame(file_data, columns=['file', 'dialect', 'speaker', 'sentence_id', 'sentence', 'previous_packet', 'next_packet','packet_size', 'phonemes', 'words'])

def load_files_with_prev_next(directory):
    filelist = os.listdir(directory)
    #read them into pandas
    df_list = [file_parser_with_prev_next(directory+file) for file in filelist]
    #concatenate them together
    return pd.concat(df_list, ignore_index=True)

def convert_types(data_frame):
    data_frame['packet_size'] = pd.to_numeric(data_frame['packet_size'])
    data_frame['previous_packet'] = pd.to_numeric(data_frame['previous_packet'])
    data_frame['next_packet'] = pd.to_numeric(data_frame['next_packet'])

    data_frame['file'] = data_frame['file'].astype('category')
    data_frame['sentence'] = data_frame['sentence'].astype('category')
    
    data_frame['dialect'] = data_frame['dialect'].astype('category')
    data_frame['speaker'] = data_frame['speaker'].astype('category')
    data_frame['sentence_id'] = data_frame['sentence_id'].astype('category')

In [3]:
skype_data_train = load_files_with_prev_next("./../data/skype_train_data/")
skype_data_test = load_files_with_prev_next("./../data/skype_test_data/")
convert_types(skype_data_train)
convert_types(skype_data_test)
skype_data_test

Unnamed: 0,file,dialect,speaker,sentence_id,sentence,previous_packet,next_packet,packet_size,phonemes,words
0,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,0,35,30,"(h#,)","(,)"
1,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,30,43,35,"(h#,)","(,)"
2,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,35,26,43,"(h#,)","(,)"
3,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,43,30,26,"(h#,)","(,)"
4,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,26,31,30,"(h#,)","(,)"
...,...,...,...,...,...,...,...,...,...,...
258516,DR8-MSLB0-SX383.CSV,DR8,MSLB0,SX383,The carpet cleaners shampooed our oriental rug.,40,43,46,"(h#,)","(,)"
258517,DR8-MSLB0-SX383.CSV,DR8,MSLB0,SX383,The carpet cleaners shampooed our oriental rug.,46,41,43,"(h#,)","(,)"
258518,DR8-MSLB0-SX383.CSV,DR8,MSLB0,SX383,The carpet cleaners shampooed our oriental rug.,43,34,41,"(h#,)","(,)"
258519,DR8-MSLB0-SX383.CSV,DR8,MSLB0,SX383,The carpet cleaners shampooed our oriental rug.,41,33,34,"(h#,)","(,)"


In [4]:
def add_surrounding(data_frame):
    data_frame['prev_curr'] = list(zip(data_frame.previous_packet, data_frame.packet_size))
    data_frame['next_curr'] = list(zip(data_frame.next_packet, data_frame.packet_size))
    data_frame['packet_surrounding'] = list(zip(data_frame.previous_packet, data_frame.packet_size, data_frame.next_packet))
    
    #data_frame['prev_curr'] = data_frame['prev_curr'].astype('category')
    #data_frame['next_curr'] = data_frame['next_curr'].astype('category')
    #data_frame['packet_surrounding'] = data_frame['packet_surrounding'].astype('category')

add_surrounding(skype_data_train)
add_surrounding(skype_data_test)

skype_data_train = skype_data_train[['file', 'dialect', 'speaker', 'sentence_id', 'sentence', 'previous_packet', 'next_packet','packet_size', 'prev_curr', 'next_curr', 'packet_surrounding', 'phonemes', 'words']]
skype_data_test = skype_data_test[['file', 'dialect', 'speaker', 'sentence_id', 'sentence', 'previous_packet', 'next_packet','packet_size', 'prev_curr', 'next_curr', 'packet_surrounding', 'phonemes', 'words']]
skype_data_train

Unnamed: 0,file,dialect,speaker,sentence_id,sentence,previous_packet,next_packet,packet_size,prev_curr,next_curr,packet_surrounding,phonemes,words
0,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,0,32,32,"(0, 32)","(32, 32)","(0, 32, 32)","(h#,)","(,)"
1,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,32,31,32,"(32, 32)","(31, 32)","(32, 32, 31)","(h#,)","(,)"
2,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,32,28,31,"(32, 31)","(28, 31)","(32, 31, 28)","(h#,)","(,)"
3,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,31,28,28,"(31, 28)","(28, 28)","(31, 28, 28)","(h#,)","(,)"
4,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,28,36,28,"(28, 28)","(36, 28)","(28, 28, 36)","(h#,)","(,)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
707433,DR8-MTCS0-SX82.CSV,DR8,MTCS0,SX82,Good service should be rewarded by big tips.,47,34,32,"(47, 32)","(34, 32)","(47, 32, 34)","(h#,)","(,)"
707434,DR8-MTCS0-SX82.CSV,DR8,MTCS0,SX82,Good service should be rewarded by big tips.,32,39,34,"(32, 34)","(39, 34)","(32, 34, 39)","(h#,)","(,)"
707435,DR8-MTCS0-SX82.CSV,DR8,MTCS0,SX82,Good service should be rewarded by big tips.,34,33,39,"(34, 39)","(33, 39)","(34, 39, 33)","(h#,)","(,)"
707436,DR8-MTCS0-SX82.CSV,DR8,MTCS0,SX82,Good service should be rewarded by big tips.,39,36,33,"(39, 33)","(36, 33)","(39, 33, 36)","(h#,)","(,)"


### Data preparation

In [5]:
skype_data_train.loc[:, ["previous_packet", "packet_size", "next_packet"]]

Unnamed: 0,previous_packet,packet_size,next_packet
0,0,32,32
1,32,32,31
2,32,31,28
3,31,28,28
4,28,28,36
...,...,...,...
707433,47,32,34
707434,32,34,39
707435,34,39,33
707436,39,33,36


In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

In [7]:
# add removal of labels for the test_dataset
def get_labels(df, label=["words"], feature=["previous_packet", "packet_size", "next_packet"]):
    labels = df.loc[:, label]
    features = df.loc[:, feature]
    return features, labels

In [8]:
def prepare_labels(train_labels, test_labels, label=["words"]):
    train_labels = train_labels.astype('category')
    test_labels = test_labels.astype('category')
    
    total_labels = train_labels.append(test_labels)
    
    lab_enc = LabelEncoder()
    lab_enc.fit(total_labels[label])

    train_labels = lab_enc.transform(train_labels[label])
    test_labels = lab_enc.transform(test_labels[label])
    
    return train_labels, test_labels, lab_enc

### Additional preprocessing

In [9]:
# there are no "empty" phonemes
single_phonemes_train = skype_data_train.loc[skype_data_train.phonemes.apply(len) == 1].reset_index(drop=True)
single_phonemes_train

Unnamed: 0,file,dialect,speaker,sentence_id,sentence,previous_packet,next_packet,packet_size,prev_curr,next_curr,packet_surrounding,phonemes,words
0,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,0,32,32,"(0, 32)","(32, 32)","(0, 32, 32)","(h#,)","(,)"
1,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,32,31,32,"(32, 32)","(31, 32)","(32, 32, 31)","(h#,)","(,)"
2,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,32,28,31,"(32, 31)","(28, 31)","(32, 31, 28)","(h#,)","(,)"
3,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,31,28,28,"(31, 28)","(28, 28)","(31, 28, 28)","(h#,)","(,)"
4,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,28,36,28,"(28, 28)","(36, 28)","(28, 28, 36)","(h#,)","(,)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
533700,DR8-MTCS0-SX82.CSV,DR8,MTCS0,SX82,Good service should be rewarded by big tips.,47,34,32,"(47, 32)","(34, 32)","(47, 32, 34)","(h#,)","(,)"
533701,DR8-MTCS0-SX82.CSV,DR8,MTCS0,SX82,Good service should be rewarded by big tips.,32,39,34,"(32, 34)","(39, 34)","(32, 34, 39)","(h#,)","(,)"
533702,DR8-MTCS0-SX82.CSV,DR8,MTCS0,SX82,Good service should be rewarded by big tips.,34,33,39,"(34, 39)","(33, 39)","(34, 39, 33)","(h#,)","(,)"
533703,DR8-MTCS0-SX82.CSV,DR8,MTCS0,SX82,Good service should be rewarded by big tips.,39,36,33,"(39, 33)","(36, 33)","(39, 33, 36)","(h#,)","(,)"


In [10]:
len(pd.unique(single_phonemes_train.phonemes))

62

In [11]:
pd.unique(single_phonemes_train.phonemes)

array([('h#',), ('sh',), ('ix',), ('hv',), ('eh',), ('jh',), ('ih',),
       ('dcl',), ('ah',), ('kcl',), ('k',), ('s',), ('ux',), ('q',),
       ('en',), ('r',), ('w',), ('ao',), ('axr',), ('l',), ('y',),
       ('uh',), ('n',), ('ae',), ('dx',), ('oy',), ('ax',), ('gcl',),
       ('dh',), ('tcl',), ('iy',), ('v',), ('t',), ('f',), ('ow',),
       ('d',), ('hh',), ('ch',), ('bcl',), ('aa',), ('em',), ('ng',),
       ('m',), ('ay',), ('th',), ('ax-h',), ('ey',), ('p',), ('pcl',),
       ('aw',), ('er',), ('z',), ('epi',), ('el',), ('uw',), ('g',),
       ('',), ('b',), ('pau',), ('zh',), ('nx',), ('eng',)], dtype=object)

Taken from: https://github.com/jhasegaw/phonecodes/blob/master/src/phonecode_tables.py

In [12]:
arpa_to_ipa = {
    'aa':'ɑ',
    'ae':'æ',
    'ah':'ʌ',
    'ah0':'ə',
    'ao':'ɔ',
    'aw':'aʊ',
    'ay':'aɪ',
    'eh':'ɛ',
    'er':'ɝ',
    'er0':'ɚ',
    'ey':'eɪ',
    'ih':'ɪ',
    'ih0':'ɨ',
    'iy':'i',
    'ow':'oʊ',
    'oy':'ɔɪ',
    'uh':'ʊ',
    'uw':'u',
    'b':'b',
    'ch':'tʃ',
    'd':'d',
    'dh':'ð',
    'el':'l̩',
    'em':'m̩',
    'en':'n̩',
    'f':'f',
    'g':'ɡ',
    'hh':'h',
    'jh':'dʒ',
    'k':'k',
    'l':'l',
    'm':'m',
    'n':'n',
    'ng':'ŋ',
    'p':'p',
    'q':'ʔ',
    'r':'ɹ',
    's':'s',
    'sh':'ʃ',
    't':'t',
    'th':'θ',
    'v':'v',
    'w':'w',
    'wh':'ʍ',
    'y':'j',
    'z':'z',
    'zh':'ʒ',

    'ax':'ə',
    'ax-h':'ə̥',
    'axr':'ɚ',
    'bcl':'b',
    'dcl':'d',
    'dx':'ɾ',
    'eng':'ŋ̍',
    'epi':'',
    'gcl':'g',
    'hv':'ɦ',
    'h#':'',
    'ix':'ɨ',
    'kcl':'k',
    'nx':'ɾ̃',
    'pau':'',
    'pcl':'p',
    'tcl':'t',
    'ux':'ʉ',
    '':'',
}

In [13]:
len(arpa_to_ipa)

66

This modification is based on this: https://en.wikipedia.org/wiki/ARPABET (+ minor guessing)

In [14]:
ipa_allophone = {
    'ŋ̍':'n', #should be ŋ
    'ə̥':'ɛ',
    'ɨ':'ɪ',
    'n̩':'n',
    'm̩':'m',
    'ŋ':'n',
    'ɾ̃':'n',
    'ð':'θ',
    'ʉ':'u',
    'ɾ':'d',
    'l̩':'l',
}

In [15]:
uniq_phon = pd.unique(single_phonemes_train.phonemes)
for i in range(len(uniq_phon)):
    uniq_phon[i] = arpa_to_ipa.get(uniq_phon[i][0], uniq_phon[i][0])
    uniq_phon[i] = ipa_allophone.get(uniq_phon[i], uniq_phon[i])
    
uniq_phon

array(['', 'ʃ', 'ɪ', 'ɦ', 'ɛ', 'dʒ', 'ɪ', 'd', 'ʌ', 'k', 'k', 's', 'u',
       'ʔ', 'n', 'ɹ', 'w', 'ɔ', 'ɚ', 'l', 'j', 'ʊ', 'n', 'æ', 'd', 'ɔɪ',
       'ə', 'g', 'θ', 't', 'i', 'v', 't', 'f', 'oʊ', 'd', 'h', 'tʃ', 'b',
       'ɑ', 'm', 'n', 'm', 'aɪ', 'θ', 'ɛ', 'eɪ', 'p', 'p', 'aʊ', 'ɝ', 'z',
       '', 'l', 'u', 'ɡ', '', 'b', '', 'ʒ', 'n', 'n'], dtype=object)

In [16]:
len(np.unique(uniq_phon))

43

Now modifying our input dataset:

In [17]:
#input is expected to be a tuple
def convert_phoneme(phoneme):
    tmp_1 = arpa_to_ipa.get(phoneme[0], phoneme[0])
    tmp_2 = ipa_allophone.get(tmp_1, tmp_1)
    return tmp_2

In [18]:
single_phonemes_train = skype_data_train.loc[skype_data_train.phonemes.apply(len) == 1].reset_index(drop=True)
single_phonemes_test = skype_data_test.loc[skype_data_test.phonemes.apply(len) == 1].reset_index(drop=True)
single_phonemes_test

Unnamed: 0,file,dialect,speaker,sentence_id,sentence,previous_packet,next_packet,packet_size,prev_curr,next_curr,packet_surrounding,phonemes,words
0,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,0,35,30,"(0, 30)","(35, 30)","(0, 30, 35)","(h#,)","(,)"
1,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,30,43,35,"(30, 35)","(43, 35)","(30, 35, 43)","(h#,)","(,)"
2,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,35,26,43,"(35, 43)","(26, 43)","(35, 43, 26)","(h#,)","(,)"
3,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,43,30,26,"(43, 26)","(30, 26)","(43, 26, 30)","(h#,)","(,)"
4,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,26,31,30,"(26, 30)","(31, 30)","(26, 30, 31)","(h#,)","(,)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
195608,DR8-MSLB0-SX383.CSV,DR8,MSLB0,SX383,The carpet cleaners shampooed our oriental rug.,40,43,46,"(40, 46)","(43, 46)","(40, 46, 43)","(h#,)","(,)"
195609,DR8-MSLB0-SX383.CSV,DR8,MSLB0,SX383,The carpet cleaners shampooed our oriental rug.,46,41,43,"(46, 43)","(41, 43)","(46, 43, 41)","(h#,)","(,)"
195610,DR8-MSLB0-SX383.CSV,DR8,MSLB0,SX383,The carpet cleaners shampooed our oriental rug.,43,34,41,"(43, 41)","(34, 41)","(43, 41, 34)","(h#,)","(,)"
195611,DR8-MSLB0-SX383.CSV,DR8,MSLB0,SX383,The carpet cleaners shampooed our oriental rug.,41,33,34,"(41, 34)","(33, 34)","(41, 34, 33)","(h#,)","(,)"


In [19]:
single_phonemes_test['phonemes'] = single_phonemes_test["phonemes"].apply(convert_phoneme)
single_phonemes_train['phonemes'] = single_phonemes_train["phonemes"].apply(convert_phoneme)
single_phonemes_train

Unnamed: 0,file,dialect,speaker,sentence_id,sentence,previous_packet,next_packet,packet_size,prev_curr,next_curr,packet_surrounding,phonemes,words
0,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,0,32,32,"(0, 32)","(32, 32)","(0, 32, 32)",,"(,)"
1,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,32,31,32,"(32, 32)","(31, 32)","(32, 32, 31)",,"(,)"
2,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,32,28,31,"(32, 31)","(28, 31)","(32, 31, 28)",,"(,)"
3,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,31,28,28,"(31, 28)","(28, 28)","(31, 28, 28)",,"(,)"
4,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,28,36,28,"(28, 28)","(36, 28)","(28, 28, 36)",,"(,)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
533700,DR8-MTCS0-SX82.CSV,DR8,MTCS0,SX82,Good service should be rewarded by big tips.,47,34,32,"(47, 32)","(34, 32)","(47, 32, 34)",,"(,)"
533701,DR8-MTCS0-SX82.CSV,DR8,MTCS0,SX82,Good service should be rewarded by big tips.,32,39,34,"(32, 34)","(39, 34)","(32, 34, 39)",,"(,)"
533702,DR8-MTCS0-SX82.CSV,DR8,MTCS0,SX82,Good service should be rewarded by big tips.,34,33,39,"(34, 39)","(33, 39)","(34, 39, 33)",,"(,)"
533703,DR8-MTCS0-SX82.CSV,DR8,MTCS0,SX82,Good service should be rewarded by big tips.,39,36,33,"(39, 33)","(36, 33)","(39, 33, 36)",,"(,)"


In [20]:
tmp = pd.unique(single_phonemes_test.phonemes)
print(len(tmp))
tmp

43


array(['', 'ʃ', 'i', 'ɦ', 'æ', 'd', 'ɝ', 'ɑ', 'ɹ', 'k', 's', 'u', 'ɪ',
       'n', 'g', 'ɡ', 'w', 'ʔ', 'ɔ', 'l', 'j', 'ɚ', 'oʊ', 't', 'ɛ', 'ɔɪ',
       'aɪ', 'θ', 'h', 'z', 'p', 'ə', 'b', 'f', 'v', 'm', 'aʊ', 'ʌ', 'eɪ',
       'tʃ', 'ʊ', 'dʒ', 'ʒ'], dtype=object)

In [21]:
train_set, train_labels = get_labels(single_phonemes_train, label=['phonemes'])
test_set, test_labels = get_labels(single_phonemes_test, label=['phonemes'])

train_labels = train_labels.astype('category')
test_labels = test_labels.astype('category')

total_labels = train_labels.append(test_labels)
print(len(pd.unique(train_labels.phonemes)))
print(len(pd.unique(test_labels.phonemes)))
total_unique_phonemes = len(pd.unique(total_labels.phonemes))
total_unique_phonemes

43
43


43

In [22]:
train_set, train_labels = get_labels(single_phonemes_train)
test_set, test_labels = get_labels(single_phonemes_test)

train_labels = train_labels.astype('category')
test_labels = test_labels.astype('category')

total_labels = train_labels.append(test_labels)
print(len(pd.unique(train_labels.words)))
print(len(pd.unique(test_labels.words)))
total_unique_words = len(pd.unique(total_labels.words))
total_unique_words

5104
2464


6387

### Merging long sequencess

In [23]:
tmp = single_phonemes_train[["file", "phonemes", "packet_surrounding"]][1:].reset_index(drop=True) != single_phonemes_train[["file", "phonemes", "packet_surrounding"]][:-1]
prev_not_same = (tmp.phonemes | tmp.packet_surrounding)
prev_not_same

0         True
1         True
2         True
3         True
4         True
          ... 
533699    True
533700    True
533701    True
533702    True
533703    True
Length: 533704, dtype: bool

In [24]:
prev_not_same.loc[-1] = True
prev_not_same.index = prev_not_same.index + 1
prev_not_same.sort_index(inplace=True)
prev_not_same

0         True
1         True
2         True
3         True
4         True
          ... 
533700    True
533701    True
533702    True
533703    True
533704    True
Length: 533705, dtype: bool

In [25]:
single_phonemes_train.loc[~prev_not_same.values]

Unnamed: 0,file,dialect,speaker,sentence_id,sentence,previous_packet,next_packet,packet_size,prev_curr,next_curr,packet_surrounding,phonemes,words
3542,DR1-FECD0-SI1418.CSV,DR1,FECD0,SI1418,Personal predispositions tend to blunt the ear...,39,39,39,"(39, 39)","(39, 39)","(39, 39, 39)",,"(,)"
5220,DR1-FETB0-SX248.CSV,DR1,FETB0,SX248,Reading in poor light gives you eyestrain.,33,33,33,"(33, 33)","(33, 33)","(33, 33, 33)",,"(,)"
6012,DR1-FJSP0-SI1763.CSV,DR1,FJSP0,SI1763,That's your headache.,42,42,42,"(42, 42)","(42, 42)","(42, 42, 42)",,"(,)"
6559,DR1-FJSP0-SX444.CSV,DR1,FJSP0,SX444,The toddler found a clamshell near the camp site.,42,42,42,"(42, 42)","(42, 42)","(42, 42, 42)",,"(,)"
8242,DR1-FMEM0-SA2.CSV,DR1,FMEM0,SA2,Don't ask me to carry an oily rag like that.,30,30,30,"(30, 30)","(30, 30)","(30, 30, 30)",,"(,)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
515430,DR8-FMBG0-SA1.CSV,DR8,FMBG0,SA1,She had your dark suit in greasy wash water al...,30,30,30,"(30, 30)","(30, 30)","(30, 30, 30)",,"(,)"
518926,DR8-MBCG0-SI2217.CSV,DR8,MBCG0,SI2217,"He'd not only told me so, he'd proved it.",63,63,63,"(63, 63)","(63, 63)","(63, 63, 63)",i,"(he'd,)"
524746,DR8-MKRG0-SX31.CSV,DR8,MKRG0,SX31,A good attitude is unbeatable.,38,38,38,"(38, 38)","(38, 38)","(38, 38, 38)",,"(,)"
525211,DR8-MMEA0-SI2018.CSV,DR8,MMEA0,SI2018,They were shattered.,27,27,27,"(27, 27)","(27, 27)","(27, 27, 27)",,"(,)"


This is going to only remove around 100 values, which is literally nothing => this won't help us.

## Models:

In [26]:
import keras
import tensorflow as tf

from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import classification_report

In [27]:
single_phonemes_train = skype_data_train.loc[skype_data_train.phonemes.apply(len) == 1].reset_index(drop=True)
single_phonemes_test = skype_data_test.loc[skype_data_test.phonemes.apply(len) == 1].reset_index(drop=True)

single_phonemes_test['phonemes'] = single_phonemes_test["phonemes"].apply(convert_phoneme)
single_phonemes_train['phonemes'] = single_phonemes_train["phonemes"].apply(convert_phoneme)

In [28]:
train_set, train_labels = get_labels(single_phonemes_train, label=['phonemes'])
test_set, test_labels = get_labels(single_phonemes_test, label=['phonemes'])

train_labels, test_labels, encoder = prepare_labels(train_labels, test_labels, label=['phonemes'])

  return f(*args, **kwargs)


In [29]:
print(train_set.shape)
print(train_labels.shape)
train_labels

(533705, 3)
(533705,)


array([0, 0, 0, ..., 0, 0, 0])

In [30]:
encoder.inverse_transform(train_labels)

array(['', '', '', ..., '', '', ''], dtype=object)

In [31]:
train_labels = to_categorical(train_labels, num_classes=total_unique_phonemes)
test_labels = to_categorical(test_labels, num_classes=total_unique_phonemes)
print(train_labels.shape)

(533705, 43)


In [32]:
model = Sequential()

model.add(Dense(units=512, activation='relu', input_dim=3*1))  # first hidden layer
model.add(Dense(units=256, activation='relu'))  # second hidden layer
# model.add(Dense(units=128, activation='relu'))  # third hidden layer
model.add(Dense(units=total_unique_phonemes, activation='softmax'))  # output layer
# model.add(Dense(units=128))  # output layer

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               2048      
_________________________________________________________________
dense_1 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_2 (Dense)              (None, 43)                11051     
Total params: 144,427
Trainable params: 144,427
Non-trainable params: 0
_________________________________________________________________


In [33]:
model.fit(train_set, train_labels, epochs=16, batch_size=256)

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


<tensorflow.python.keras.callbacks.History at 0x7fc1f8644198>

In [34]:
print("test loss, test acc:", model.evaluate(test_set, test_labels))

test loss, test acc: [2.7818515300750732, 0.251639723777771]


#### Looking into predictions

In [35]:
predictions = model.predict(train_set)
print(predictions[0])
np.argmax(predictions[0])

[1.0000000e+00 1.5664994e-18 5.1657892e-18 3.4360964e-13 7.2738590e-12
 8.2338135e-16 1.3029220e-17 1.8735704e-15 7.4111111e-13 1.3470942e-17
 5.6108193e-16 4.0928293e-16 1.1059671e-10 1.0719312e-15 2.2022715e-16
 2.3376375e-14 9.0254072e-21 5.6870753e-10 3.0952113e-15 1.3630438e-09
 2.9085349e-16 2.5611391e-16 1.4442367e-14 3.0740780e-15 8.8032828e-16
 5.3163752e-15 2.8342186e-17 7.0128281e-18 1.8402124e-18 7.9902635e-16
 3.0420285e-18 2.8845040e-15 4.6031349e-22 9.2929475e-15 2.4367294e-28
 5.5194319e-15 4.5049643e-16 6.2950621e-19 3.0640642e-25 6.9720208e-17
 2.4212579e-26 3.7422556e-12 5.0861529e-12]


0

In [36]:
predictions_array = [np.argmax(x) for x in predictions]
encoder.inverse_transform(predictions_array)

array(['', '', '', ..., '', '', ''], dtype=object)

In [37]:
comparison = single_phonemes_train.assign(predictions=encoder.inverse_transform(predictions_array))
compar = comparison[["phonemes", "predictions", "words"]]
compar

Unnamed: 0,phonemes,predictions,words
0,,,"(,)"
1,,,"(,)"
2,,,"(,)"
3,,,"(,)"
4,,,"(,)"
...,...,...,...
533700,,,"(,)"
533701,,,"(,)"
533702,,,"(,)"
533703,,,"(,)"


In [38]:
for phoneme in pd.unique(compar.phonemes):
    tmp = compar.loc[comparison.phonemes == phoneme]
    print("\n\nNow showing values for phoneme:", phoneme)
    print("Accuraccy:", len(tmp.loc[tmp.phonemes == tmp.predictions])/len(compar))
    display(tmp.predictions.value_counts())



Now showing values for phoneme: 
Accuraccy: 0.159762415566652


     85266
s     1421
n     1313
ɪ     1172
i     1076
k      908
æ      904
t      867
d      246
ɹ       34
ʔ       18
w       10
θ        7
p        2
Name: predictions, dtype: int64



Now showing values for phoneme: ʃ
Accuraccy: 0.0


s    8794
ɪ     954
z     336
k     149
ɹ     123
i      93
æ      81
n      41
t      28
ɔ      20
        9
ɦ       3
d       1
ʔ       1
l       1
Name: predictions, dtype: int64



Now showing values for phoneme: ɪ
Accuraccy: 0.01539989319942665


s     12167
ɪ      8219
i      2256
æ      1866
ɹ      1135
        738
n       680
z       452
k       372
ɔ       147
t        82
d        50
ɦ        45
w        19
ʔ        13
θ         3
l         2
dʒ        1
Name: predictions, dtype: int64



Now showing values for phoneme: ɦ
Accuraccy: 0.00018362203839199557


s    1240
z     406
ɪ     331
ɹ     311
ɔ     108
ɦ      98
æ      91
i      82
       41
k      18
n      14
d       2
t       2
w       1
Name: predictions, dtype: int64



Now showing values for phoneme: ɛ
Accuraccy: 0.0


s     6721
ɪ     3529
i     1098
æ      890
ɹ      484
       352
z      330
n      241
k      117
ɔ       62
t       38
d       20
ɦ        8
w        7
ʔ        5
l        3
θ        3
dʒ       1
f        1
Name: predictions, dtype: int64



Now showing values for phoneme: dʒ
Accuraccy: 1.8736942693060773e-06


s     1110
ɪ      365
k      356
ɹ      130
t      125
z       92
æ       24
        24
i       13
ɔ       12
w        8
n        5
ɦ        5
θ        3
ʔ        3
dʒ       1
Name: predictions, dtype: int64



Now showing values for phoneme: d
Accuraccy: 0.0012141538865103382


     4672
k    1480
t     854
d     648
ɪ     573
n     500
s     480
i     261
æ     121
ɹ     111
ʔ      34
ɔ      16
θ      14
z       9
w       7
ɦ       4
p       1
Name: predictions, dtype: int64



Now showing values for phoneme: ʌ
Accuraccy: 0.0


s    3789
ɪ    2048
i     533
æ     476
ɹ     355
z     283
      133
n      93
k      83
ɔ      59
ɦ      20
t      15
d      12
w       5
ʔ       3
θ       2
Name: predictions, dtype: int64



Now showing values for phoneme: k
Accuraccy: 0.007352376312757048


      7327
k     3924
s     2469
t     2147
ɪ     1491
æ      448
n      429
i      379
d      350
ɹ      176
ʔ       38
w       25
p       15
θ       13
z        5
ɔ        4
l        2
ɦ        1
dʒ       1
Name: predictions, dtype: int64



Now showing values for phoneme: s
Accuraccy: 0.0503143122136761


s    26853
ɪ     4498
z      899
æ      607
ɹ      509
k      417
i      415
n      114
t       81
ɔ       69
        55
w        6
θ        4
ɦ        3
ʔ        3
d        2
Name: predictions, dtype: int64



Now showing values for phoneme: u
Accuraccy: 0.0


s    3479
ɪ    2684
i    1362
æ    1258
     1010
n     396
ɹ     325
k      99
z      54
ɔ      19
d      17
w      10
ɦ       7
ʔ       6
t       5
l       5
θ       3
f       1
Name: predictions, dtype: int64



Now showing values for phoneme: ʔ
Accuraccy: 0.00010117949054252818


     2408
s    1151
ɪ    1113
k     882
i     562
n     512
æ     463
t     380
d     237
ɹ     217
ʔ      54
z      31
w      22
θ      20
ɔ      13
ɦ       4
p       3
f       1
Name: predictions, dtype: int64



Now showing values for phoneme: n
Accuraccy: 0.0032789649712856354


      3996
ɪ     3685
s     3277
i     2340
n     1750
æ     1293
k      374
ɹ      277
d      264
t       55
w       33
z       17
ʔ       12
θ        6
ɔ        6
ɦ        1
dʒ       1
f        1
Name: predictions, dtype: int64



Now showing values for phoneme: ɹ
Accuraccy: 0.0028517626778838496


s     5228
ɪ     2761
ɹ     1522
æ     1305
i      831
       587
z      270
k      268
n      218
ɔ       74
ɦ       40
w       35
t       24
d       13
ʔ        7
θ        5
l        3
dʒ       1
Name: predictions, dtype: int64



Now showing values for phoneme: w
Accuraccy: 9.743210200391603e-05


s    2077
ɪ    1307
æ     881
ɹ     839
      682
i     651
k     411
n     174
z      84
w      52
ɔ      31
t      29
d      14
ʔ      13
ɦ      10
θ       6
l       1
Name: predictions, dtype: int64



Now showing values for phoneme: ɔ
Accuraccy: 0.0002323380893939536


s     7538
ɪ     3341
æ     1152
i     1071
ɹ      767
       429
z      393
n      218
k      124
ɔ      124
ɦ       37
t       17
w       10
d        6
θ        3
ʔ        1
l        1
dʒ       1
Name: predictions, dtype: int64



Now showing values for phoneme: ɚ
Accuraccy: 0.0


s    4364
ɪ    2692
æ    1054
i     941
ɹ     420
      414
n     260
k      98
z      90
ɔ      42
d       9
ɦ       7
t       6
ʔ       3
w       2
Name: predictions, dtype: int64



Now showing values for phoneme: l
Accuraccy: 7.494777077224309e-06


s    6177
ɪ    3800
æ    1272
i    1246
ɹ    1096
      426
n     290
k     214
z     192
ɔ      88
ɦ      25
d      20
t      20
w      12
ʔ       6
l       4
Name: predictions, dtype: int64



Now showing values for phoneme: j
Accuraccy: 0.0


s    1412
ɪ     919
æ     463
i     401
      271
ɹ     156
n     145
k     112
z      31
ɔ      16
t      15
d      13
w       6
ɦ       5
θ       3
l       1
Name: predictions, dtype: int64



Now showing values for phoneme: ʊ
Accuraccy: 0.0


s    712
ɪ    411
i    114
æ     81
ɹ     69
z     48
      23
n     19
k     10
ɔ      7
t      4
ɦ      3
f      1
d      1
w      1
Name: predictions, dtype: int64



Now showing values for phoneme: æ
Accuraccy: 0.004480002997910831


s    10647
ɪ     6157
æ     2391
i     2376
      1371
ɹ      943
n      645
z      500
k      279
ɔ      106
d       43
ɦ       31
t       30
w       26
ʔ       10
l        2
Name: predictions, dtype: int64



Now showing values for phoneme: ɔɪ
Accuraccy: 0.0


s    2614
ɪ     959
ɹ     348
æ     247
i     239
z     181
ɔ      66
       44
n      43
k      36
ɦ      21
t      15
w       3
d       1
Name: predictions, dtype: int64



Now showing values for phoneme: ə
Accuraccy: 0.0


s    2294
ɪ    1593
æ     280
i     237
ɹ     235
z     108
k      99
       79
n      73
ɔ      30
t      24
ɦ      16
d       9
w       4
ʔ       4
l       1
Name: predictions, dtype: int64



Now showing values for phoneme: g
Accuraccy: 0.0


     1808
k     508
n     228
t     223
d     174
i     113
ɪ     108
s      56
æ      32
ʔ       8
ɹ       7
p       2
θ       2
Name: predictions, dtype: int64



Now showing values for phoneme: θ
Accuraccy: 4.309496819403978e-05


     1384
s     987
ɪ     714
k     691
i     347
n     337
æ     237
t     184
d     140
ɹ     111
w      38
θ      23
ʔ      17
z      15
ɔ       7
ɦ       1
f       1
l       1
Name: predictions, dtype: int64



Now showing values for phoneme: t
Accuraccy: 0.004579308794184053


      7357
k     2583
t     2444
s     2401
ɪ     1288
n      403
d      374
i      277
æ      237
ɹ      140
ʔ       30
p       10
θ        8
w        7
z        5
ɔ        3
dʒ       2
ɦ        1
Name: predictions, dtype: int64



Now showing values for phoneme: i
Accuraccy: 0.005319418030559953


s    9319
ɪ    6272
i    2839
æ    2077
     1411
n     843
ɹ     789
z     423
k     223
ɔ      62
d      51
w      32
ɦ      31
t      24
θ       6
ʔ       5
f       1
l       1
Name: predictions, dtype: int64



Now showing values for phoneme: v
Accuraccy: 0.0


s    860
     855
ɪ    705
i    472
n    340
æ    244
ɹ    154
k    133
d     75
z     58
t     22
ɔ     20
w     17
ʔ     10
ɦ      9
θ      8
l      1
Name: predictions, dtype: int64



Now showing values for phoneme: f
Accuraccy: 3.7473885386121546e-06


s    4021
ɪ    1698
      983
i     816
æ     675
n     400
k     255
ɹ     149
z      64
d      43
t      29
w      12
ɔ      10
ʔ       2
f       2
ɦ       1
θ       1
Name: predictions, dtype: int64



Now showing values for phoneme: oʊ
Accuraccy: 0.0


s     5401
ɪ     2662
æ      960
i      922
ɹ      511
       325
z      196
n      141
k       75
ɔ       46
t       15
ɦ       15
d        7
w        5
l        1
dʒ       1
θ        1
Name: predictions, dtype: int64



Now showing values for phoneme: h
Accuraccy: 0.0


s    993
ɪ    355
     196
i    149
æ    148
k    100
ɹ     84
n     50
z     32
ɔ      9
t      5
d      4
w      4
ɦ      2
Name: predictions, dtype: int64



Now showing values for phoneme: tʃ
Accuraccy: 0.0


s    1764
ɪ     404
k     253
t     102
ɹ      73
æ      29
       22
i      21
z      11
n       6
w       3
ɔ       3
ɦ       2
θ       2
ʔ       1
Name: predictions, dtype: int64



Now showing values for phoneme: b
Accuraccy: 0.0


     2860
k     646
t     317
d     136
n     107
ɪ      58
i      42
s      35
ʔ      11
æ       4
p       4
ɹ       4
z       1
θ       1
Name: predictions, dtype: int64



Now showing values for phoneme: ɑ
Accuraccy: 0.0


s     7820
ɪ     3539
i     1091
æ     1057
ɹ      768
z      447
       392
n      220
ɔ      114
k      107
ɦ       33
t       18
d       10
w       10
dʒ       1
Name: predictions, dtype: int64



Now showing values for phoneme: m
Accuraccy: 0.0


      1790
ɪ     1643
s     1616
i     1088
æ      750
n      719
k      374
ɹ      181
d       91
t       37
w       21
ʔ       16
z        5
θ        4
ɔ        2
ɦ        2
dʒ       1
Name: predictions, dtype: int64



Now showing values for phoneme: aɪ
Accuraccy: 0.0


s    7570
ɪ    3221
æ    1111
i     974
ɹ     521
      493
z     403
n     204
ɔ     128
k      85
ɦ      34
t      16
d      13
w       5
ʔ       1
l       1
Name: predictions, dtype: int64



Now showing values for phoneme: eɪ
Accuraccy: 0.0


s    5301
ɪ    2898
i    1199
æ    1089
      620
ɹ     310
n     302
z     258
k      88
ɔ      37
ɦ      17
t      16
d      15
w       5
l       3
ʔ       1
f       1
Name: predictions, dtype: int64



Now showing values for phoneme: p
Accuraccy: 2.4358025500979007e-05


     4865
k    1638
t    1178
s     660
ɪ     539
æ     219
i     164
n     134
ɹ     131
d      87
ʔ      18
w      16
p      13
θ      11
z       7
ɦ       3
ɔ       2
l       1
Name: predictions, dtype: int64



Now showing values for phoneme: aʊ
Accuraccy: 0.0


s    2426
ɪ    1128
æ     511
i     446
      216
ɹ     142
n     100
z      88
k      42
d       9
ɔ       9
t       3
w       3
ɦ       1
Name: predictions, dtype: int64



Now showing values for phoneme: ɝ
Accuraccy: 0.0


s    4102
ɪ    2493
æ    1123
i     978
ɹ     441
      354
n     186
z     104
k      74
ɔ      35
t      13
d       9
w       9
ɦ       8
ʔ       1
l       1
Name: predictions, dtype: int64



Now showing values for phoneme: z
Accuraccy: 0.002877994397654135


s    7426
ɪ    1704
z    1536
ɹ     369
i     205
k     172
æ     153
ɦ     128
ɔ      74
n      69
       32
t      22
d       9
w       4
ʔ       3
θ       2
Name: predictions, dtype: int64



Now showing values for phoneme: ɡ
Accuraccy: 0.0


k     393
      205
æ     104
ɪ      88
t      71
i      63
s      62
ɹ      36
n      27
w      13
d       8
ʔ       8
θ       4
z       1
dʒ      1
Name: predictions, dtype: int64



Now showing values for phoneme: ʒ
Accuraccy: 0.0


s    285
z     88
ɪ     44
ɹ     12
ɦ     10
æ      8
ɔ      5
i      4
k      2
n      1
d      1
       1
Name: predictions, dtype: int64

Well, we can see, that our model works only on paper, because if we focus only on "important" predictions, the accuraccies are really low and almost nothing saying.

Adding scaler (as values should be in range (-1,1) or here (0,1))

In [37]:
from sklearn import preprocessing

scaler = preprocessing.MinMaxScaler()

train_set[["previous_packet", "next_packet", "packet_size"]] = scaler.fit_transform(train_set[["previous_packet", "next_packet", "packet_size"]])
test_set[["previous_packet", "next_packet", "packet_size"]] = scaler.transform(test_set[["previous_packet", "next_packet", "packet_size"]])
test_set

Unnamed: 0,previous_packet,packet_size,next_packet
0,0.000000,0.217391,0.343137
1,0.294118,0.271739,0.421569
2,0.343137,0.358696,0.254902
3,0.421569,0.173913,0.294118
4,0.254902,0.217391,0.303922
...,...,...,...
195608,0.392157,0.391304,0.421569
195609,0.450980,0.358696,0.401961
195610,0.421569,0.336957,0.333333
195611,0.401961,0.260870,0.323529


In [38]:
model = Sequential()

model.add(Dense(units=512, activation='relu', input_dim=3*1))  # first hidden layer
model.add(Dense(units=256, activation='relu'))  # second hidden layer
# model.add(Dense(units=128, activation='relu'))  # third hidden layer
model.add(Dense(units=total_unique_phonemes, activation='softmax'))  # output layer
# model.add(Dense(units=128))  # output layer

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 512)               2048      
_________________________________________________________________
dense_4 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_5 (Dense)              (None, 43)                11051     
Total params: 144,427
Trainable params: 144,427
Non-trainable params: 0
_________________________________________________________________


In [39]:
model.fit(train_set, train_labels, epochs=16, batch_size=256)

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


<tensorflow.python.keras.callbacks.History at 0x7efd2c0bea90>

In [40]:
print("test loss, test acc:", model.evaluate(test_set, test_labels))

test loss, test acc: [2.7797038555145264, 0.25186464190483093]


### LSTM

In [41]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

In [26]:
# model_lstm = Sequential()
# model_lstm.add(LSTM(256, input_shape = (1, 3)))
# model_lstm.add(Dense(units=total_unique_words))
# model_lstm.compile(loss='categorical_crossentropy',
#               optimizer='adam',
#               metrics=['accuracy']
#              )

# model_lstm.summary()

In [42]:
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers import SpatialDropout1D

In [28]:
#more elaborate model
model_lstm = Sequential()

#model_lstm.add(Embedding(input_dim = 3, output_dim = 2, input_length = 86497))
#model_lstm.add(SpatialDropout1D(0.3))
model_lstm.add(LSTM(256, input_shape = (1, 3), dropout = 0.3, recurrent_dropout = 0.3))
model_lstm.add(Dense(256, activation = 'relu'))
model_lstm.add(Dropout(0.3))
model_lstm.add(Dense(total_unique_phonemes, activation = 'softmax'))

model_lstm.compile(
    loss='categorical_crossentropy',
    optimizer='Adam',
    metrics=['accuracy']
)

model_lstm.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 256)               266240    
_________________________________________________________________
dense_1 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 43)                11051     
Total params: 343,083
Trainable params: 343,083
Non-trainable params: 0
_________________________________________________________________


In [29]:
single_phonemes_train = skype_data_train.loc[skype_data_train.phonemes.apply(len) == 1].reset_index(drop=True)
single_phonemes_test = skype_data_test.loc[skype_data_test.phonemes.apply(len) == 1].reset_index(drop=True)

single_phonemes_test['phonemes'] = single_phonemes_test["phonemes"].apply(convert_phoneme)
single_phonemes_train['phonemes'] = single_phonemes_train["phonemes"].apply(convert_phoneme)

In [30]:
train_set, train_labels = get_labels(single_phonemes_train, label=['phonemes'])
test_set, test_labels = get_labels(single_phonemes_test, label=['phonemes'])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels, label=['phonemes'])

  return f(*args, **kwargs)


In [31]:
from keras.utils import to_categorical

train_labels = to_categorical(train_labels, num_classes=total_unique_phonemes)
test_labels = to_categorical(test_labels, num_classes=total_unique_phonemes)
print(train_labels.shape)

(533705, 43)


In [32]:
reshaped_values = train_set.values.reshape(-1, 1, 3)
reshaped_values[0][0]

array([ 0, 32, 32])

In [33]:
model_lstm.fit(reshaped_values, train_labels, epochs=16, batch_size=128)

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


<tensorflow.python.keras.callbacks.History at 0x7f8ca99835c0>

In [34]:
print("test loss, test acc:", model_lstm.evaluate(test_set.values.reshape(-1, 1, 3), test_labels))

test loss, test acc: [4.356447696685791, 0.17691053450107574]


In [35]:
from sklearn import preprocessing

scaler = preprocessing.MinMaxScaler()

train_set[["previous_packet", "next_packet", "packet_size"]] = scaler.fit_transform(train_set[["previous_packet", "next_packet", "packet_size"]])
test_set[["previous_packet", "next_packet", "packet_size"]] = scaler.transform(test_set[["previous_packet", "next_packet", "packet_size"]])
test_set

Unnamed: 0,previous_packet,packet_size,next_packet
0,0.000000,0.217391,0.343137
1,0.294118,0.271739,0.421569
2,0.343137,0.358696,0.254902
3,0.421569,0.173913,0.294118
4,0.254902,0.217391,0.303922
...,...,...,...
195608,0.392157,0.391304,0.421569
195609,0.450980,0.358696,0.401961
195610,0.421569,0.336957,0.333333
195611,0.401961,0.260870,0.323529


In [36]:
#more elaborate model
model_lstm = Sequential()

#model_lstm.add(Embedding(input_dim = 3, output_dim = 2, input_length = 86497))
#model_lstm.add(SpatialDropout1D(0.3))
model_lstm.add(LSTM(256, input_shape = (1, 3), dropout = 0.3, recurrent_dropout = 0.3))
model_lstm.add(Dense(256, activation = 'relu'))
model_lstm.add(Dropout(0.3))
model_lstm.add(Dense(total_unique_phonemes, activation = 'softmax'))

model_lstm.compile(
    loss='categorical_crossentropy',
    optimizer='Adam',
    metrics=['accuracy']
)

model_lstm.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 256)               266240    
_________________________________________________________________
dense_3 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 43)                11051     
Total params: 343,083
Trainable params: 343,083
Non-trainable params: 0
_________________________________________________________________


In [37]:
reshaped_values = train_set.values.reshape(-1, 1, 3)
reshaped_values[0][0]

array([0.        , 0.23913043, 0.31372549])

In [38]:
model_lstm.fit(reshaped_values, train_labels, epochs=16, batch_size=128)

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


<tensorflow.python.keras.callbacks.History at 0x7f8ca9bc4f28>

In [39]:
print("test loss, test acc:", model_lstm.evaluate(test_set.values.reshape(-1, 1, 3), test_labels))

test loss, test acc: [4.750559329986572, 0.17720703780651093]


### KFold crossvalidation:

In [49]:
def create_model(output_size):
    model = Sequential()

    model.add(Dense(units=512, activation='relu', input_dim=3*1))  # first hidden layer
    model.add(Dense(units=256, activation='relu'))  # second hidden layer
    model.add(Dense(units=output_size, activation='softmax'))  # output layer

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    model.summary()
    
    return model

In [50]:
single_phonemes_train = skype_data_train.loc[skype_data_train.phonemes.apply(len) == 1].reset_index(drop=True)
single_phonemes_test = skype_data_test.loc[skype_data_test.phonemes.apply(len) == 1].reset_index(drop=True)

single_phonemes_test['phonemes'] = single_phonemes_test["phonemes"].apply(convert_phoneme)
single_phonemes_train['phonemes'] = single_phonemes_train["phonemes"].apply(convert_phoneme)

In [51]:
train_set, train_labels = get_labels(single_phonemes_train, label=['phonemes'])
test_set, test_labels = get_labels(single_phonemes_test, label=['phonemes'])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels, label=['phonemes'])

  return f(*args, **kwargs)


In [52]:
from sklearn import preprocessing

scaler = preprocessing.MinMaxScaler()

train_set[["previous_packet", "next_packet", "packet_size"]] = scaler.fit_transform(train_set[["previous_packet", "next_packet", "packet_size"]])
test_set[["previous_packet", "next_packet", "packet_size"]] = scaler.transform(test_set[["previous_packet", "next_packet", "packet_size"]])

In [53]:
from keras.utils import to_categorical

train_labels = to_categorical(train_labels, num_classes=total_unique_phonemes)
test_labels = to_categorical(test_labels, num_classes=total_unique_phonemes)
print(train_labels.shape)

(533705, 43)


In [54]:
dialects = pd.unique(skype_data_train.dialect)

results = []

for dialect in dialects:
    print("\n\nNow validating on dialect:", dialect)
    
    set_train = train_set.loc[single_phonemes_train["dialect"] != dialect]
    label_train = train_labels[single_phonemes_train["dialect"] != dialect]
    
    validation_set = train_set.loc[single_phonemes_train["dialect"] == dialect]
    validation_labels = train_labels[single_phonemes_train["dialect"] == dialect]
    
    model = create_model(total_unique_phonemes)
    
    display(model.fit(set_train, label_train, epochs=32, batch_size=256))
    
    result = model.evaluate(validation_set, validation_labels)
    results.extend([result])
    
    print("test loss, test acc:", result)
print("\nDone!")

print(results)



Now validating on dialect: DR1
Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_30 (Dense)             (None, 512)               2048      
_________________________________________________________________
dense_31 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_32 (Dense)             (None, 43)                11051     
Total params: 144,427
Trainable params: 144,427
Non-trainable params: 0
_________________________________________________________________
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


<tensorflow.python.keras.callbacks.History at 0x7efd2c192198>

test loss, test acc: [2.742433547973633, 0.2659114599227905]


Now validating on dialect: DR2
Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_33 (Dense)             (None, 512)               2048      
_________________________________________________________________
dense_34 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_35 (Dense)             (None, 43)                11051     
Total params: 144,427
Trainable params: 144,427
Non-trainable params: 0
_________________________________________________________________
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


<tensorflow.python.keras.callbacks.History at 0x7efd2c3234a8>

test loss, test acc: [2.740063190460205, 0.26007410883903503]


Now validating on dialect: DR3
Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_36 (Dense)             (None, 512)               2048      
_________________________________________________________________
dense_37 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_38 (Dense)             (None, 43)                11051     
Total params: 144,427
Trainable params: 144,427
Non-trainable params: 0
_________________________________________________________________
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


<tensorflow.python.keras.callbacks.History at 0x7efd2c3de780>

test loss, test acc: [2.7439026832580566, 0.2607249617576599]


Now validating on dialect: DR4
Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_39 (Dense)             (None, 512)               2048      
_________________________________________________________________
dense_40 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_41 (Dense)             (None, 43)                11051     
Total params: 144,427
Trainable params: 144,427
Non-trainable params: 0
_________________________________________________________________
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


<tensorflow.python.keras.callbacks.History at 0x7efd2c3f3898>

test loss, test acc: [2.7913384437561035, 0.24482445418834686]


Now validating on dialect: DR5
Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_42 (Dense)             (None, 512)               2048      
_________________________________________________________________
dense_43 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_44 (Dense)             (None, 43)                11051     
Total params: 144,427
Trainable params: 144,427
Non-trainable params: 0
_________________________________________________________________
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


<tensorflow.python.keras.callbacks.History at 0x7efd2c481470>

test loss, test acc: [2.771883487701416, 0.25521525740623474]


Now validating on dialect: DR6
Model: "sequential_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_45 (Dense)             (None, 512)               2048      
_________________________________________________________________
dense_46 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_47 (Dense)             (None, 43)                11051     
Total params: 144,427
Trainable params: 144,427
Non-trainable params: 0
_________________________________________________________________
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


<tensorflow.python.keras.callbacks.History at 0x7efd2c515e48>

test loss, test acc: [2.7528064250946045, 0.2638198137283325]


Now validating on dialect: DR7
Model: "sequential_16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_48 (Dense)             (None, 512)               2048      
_________________________________________________________________
dense_49 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_50 (Dense)             (None, 43)                11051     
Total params: 144,427
Trainable params: 144,427
Non-trainable params: 0
_________________________________________________________________
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


<tensorflow.python.keras.callbacks.History at 0x7efd2c5aafd0>

test loss, test acc: [2.726121187210083, 0.263571172952652]


Now validating on dialect: DR8
Model: "sequential_17"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_51 (Dense)             (None, 512)               2048      
_________________________________________________________________
dense_52 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_53 (Dense)             (None, 43)                11051     
Total params: 144,427
Trainable params: 144,427
Non-trainable params: 0
_________________________________________________________________
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


<tensorflow.python.keras.callbacks.History at 0x7efd2c5e3c88>

test loss, test acc: [2.729466438293457, 0.2649991512298584]

Done!
[[2.742433547973633, 0.2659114599227905], [2.740063190460205, 0.26007410883903503], [2.7439026832580566, 0.2607249617576599], [2.7913384437561035, 0.24482445418834686], [2.771883487701416, 0.25521525740623474], [2.7528064250946045, 0.2638198137283325], [2.726121187210083, 0.263571172952652], [2.729466438293457, 0.2649991512298584]]
