In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

from sklearn.model_selection import GridSearchCV

sns.set()  # make plots nicer

np.random.seed(42)  # set seed for reproducibility

In [2]:
def file_parser_with_prev_next(path):
    file = open(path, 'r')
    lines = file.readlines()
    
    file_name = [path.split('/')[-1]]
    sentence = ""
    file_data = []
    
    has_value = False
    previous = 0
    
    for line in lines:
        line = line.strip()
        
        # if there are only 2 informations on line and second is h#, then ignore
        # strip line, split primarly on ; secondary on ,
        if (line.startswith('#')):
            if (not sentence):
                sentence = line[len('# Sentence: "'): len(line) - 1]
            continue
        
        line = line.split(';')
        
        if (len(line) == 1):
            #lines containing only their packet size and nothing else, they should be added
            #TODO
            line += [""]
            line += [""]
            #continue
        
        if (len(line) == 2):
            #this tries to remove most of the silence at the start of the recording
            #potentionally harmfull as we shouldn't clean test data this way (we will be reading labels)
            #if (line[1] == 'h#'):
            #    continue
            line += [""]
        
        line[1] = tuple(line[1].split(','))
        line[2] = tuple(list(map(lambda a: a.strip('"'), line[2].split(','))))
        
        if (has_value):
            file_data[-1][-4] = line[0]
           
        # file_type and sentence contain duplicate informations, but are kept for readability
        split_filename = file_name[0].split('-')
        
        line = file_name + [split_filename[0]] + [split_filename[1]] + [split_filename[2][0:-4]] + [sentence] + [previous] + [0] + line
        #adding previous as feature
        previous = line[-3]
        file_data += [line]
        
        #adding next frame as feature
        has_value = True
        
    return pd.DataFrame(file_data, columns=['file', 'dialect', 'speaker', 'sentence_id', 'sentence', 'previous_packet', 'next_packet','packet_size', 'phonemes', 'words'])

def load_files_with_prev_next(directory):
    filelist = os.listdir(directory)
    #read them into pandas
    df_list = [file_parser_with_prev_next(directory+file) for file in filelist]
    #concatenate them together
    return pd.concat(df_list, ignore_index=True)

def convert_types(data_frame):
    data_frame['packet_size'] = pd.to_numeric(data_frame['packet_size'])
    data_frame['previous_packet'] = pd.to_numeric(data_frame['previous_packet'])
    data_frame['next_packet'] = pd.to_numeric(data_frame['next_packet'])

    data_frame['file'] = data_frame['file'].astype('category')
    data_frame['sentence'] = data_frame['sentence'].astype('category')
    
    data_frame['dialect'] = data_frame['dialect'].astype('category')
    data_frame['speaker'] = data_frame['speaker'].astype('category')
    data_frame['sentence_id'] = data_frame['sentence_id'].astype('category')

In [3]:
skype_data_train = load_files_with_prev_next("./../data/skype_train_data/")
skype_data_test = load_files_with_prev_next("./../data/skype_test_data/")
convert_types(skype_data_train)
convert_types(skype_data_test)
skype_data_test

Unnamed: 0,file,dialect,speaker,sentence_id,sentence,previous_packet,next_packet,packet_size,phonemes,words
0,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,0,35,30,"(h#,)","(,)"
1,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,30,43,35,"(h#,)","(,)"
2,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,35,26,43,"(h#,)","(,)"
3,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,43,30,26,"(h#,)","(,)"
4,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,26,31,30,"(h#,)","(,)"
...,...,...,...,...,...,...,...,...,...,...
258516,DR8-MSLB0-SX383.CSV,DR8,MSLB0,SX383,The carpet cleaners shampooed our oriental rug.,40,43,46,"(h#,)","(,)"
258517,DR8-MSLB0-SX383.CSV,DR8,MSLB0,SX383,The carpet cleaners shampooed our oriental rug.,46,41,43,"(h#,)","(,)"
258518,DR8-MSLB0-SX383.CSV,DR8,MSLB0,SX383,The carpet cleaners shampooed our oriental rug.,43,34,41,"(h#,)","(,)"
258519,DR8-MSLB0-SX383.CSV,DR8,MSLB0,SX383,The carpet cleaners shampooed our oriental rug.,41,33,34,"(h#,)","(,)"


In [4]:
def add_surrounding(data_frame):
    data_frame['prev_curr'] = list(zip(data_frame.previous_packet, data_frame.packet_size))
    data_frame['next_curr'] = list(zip(data_frame.next_packet, data_frame.packet_size))
    data_frame['packet_surrounding'] = list(zip(data_frame.previous_packet, data_frame.packet_size, data_frame.next_packet))
    
    #data_frame['prev_curr'] = data_frame['prev_curr'].astype('category')
    #data_frame['next_curr'] = data_frame['next_curr'].astype('category')
    #data_frame['packet_surrounding'] = data_frame['packet_surrounding'].astype('category')

add_surrounding(skype_data_train)
add_surrounding(skype_data_test)

skype_data_train = skype_data_train[['file', 'dialect', 'speaker', 'sentence_id', 'sentence', 'previous_packet', 'next_packet','packet_size', 'prev_curr', 'next_curr', 'packet_surrounding', 'phonemes', 'words']]
skype_data_test = skype_data_test[['file', 'dialect', 'speaker', 'sentence_id', 'sentence', 'previous_packet', 'next_packet','packet_size', 'prev_curr', 'next_curr', 'packet_surrounding', 'phonemes', 'words']]
skype_data_train

Unnamed: 0,file,dialect,speaker,sentence_id,sentence,previous_packet,next_packet,packet_size,prev_curr,next_curr,packet_surrounding,phonemes,words
0,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,0,32,32,"(0, 32)","(32, 32)","(0, 32, 32)","(h#,)","(,)"
1,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,32,31,32,"(32, 32)","(31, 32)","(32, 32, 31)","(h#,)","(,)"
2,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,32,28,31,"(32, 31)","(28, 31)","(32, 31, 28)","(h#,)","(,)"
3,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,31,28,28,"(31, 28)","(28, 28)","(31, 28, 28)","(h#,)","(,)"
4,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,28,36,28,"(28, 28)","(36, 28)","(28, 28, 36)","(h#,)","(,)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
707433,DR8-MTCS0-SX82.CSV,DR8,MTCS0,SX82,Good service should be rewarded by big tips.,47,34,32,"(47, 32)","(34, 32)","(47, 32, 34)","(h#,)","(,)"
707434,DR8-MTCS0-SX82.CSV,DR8,MTCS0,SX82,Good service should be rewarded by big tips.,32,39,34,"(32, 34)","(39, 34)","(32, 34, 39)","(h#,)","(,)"
707435,DR8-MTCS0-SX82.CSV,DR8,MTCS0,SX82,Good service should be rewarded by big tips.,34,33,39,"(34, 39)","(33, 39)","(34, 39, 33)","(h#,)","(,)"
707436,DR8-MTCS0-SX82.CSV,DR8,MTCS0,SX82,Good service should be rewarded by big tips.,39,36,33,"(39, 33)","(36, 33)","(39, 33, 36)","(h#,)","(,)"


### Data preparation

In [5]:
skype_data_train.loc[:, ["previous_packet", "packet_size", "next_packet"]]

Unnamed: 0,previous_packet,packet_size,next_packet
0,0,32,32
1,32,32,31
2,32,31,28
3,31,28,28
4,28,28,36
...,...,...,...
707433,47,32,34
707434,32,34,39
707435,34,39,33
707436,39,33,36


In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

In [7]:
# add removal of labels for the test_dataset
def get_labels(df, label=["words"], feature=["previous_packet", "packet_size", "next_packet"]):
    labels = df.loc[:, label]
    features = df.loc[:, feature]
    return features, labels

In [8]:
def prepare_labels(train_labels, test_labels, label=["words"]):
    train_labels = train_labels.astype('category')
    test_labels = test_labels.astype('category')
    
    total_labels = train_labels.append(test_labels)
    
    lab_enc = LabelEncoder()
    lab_enc.fit(total_labels[label])

    train_labels = lab_enc.transform(train_labels[label])
    test_labels = lab_enc.transform(test_labels[label])
    
    return train_labels, test_labels, lab_enc

### Additional preprocessing

In [9]:
# there are no "empty" phonemes
single_phonemes_train = skype_data_train.loc[skype_data_train.phonemes.apply(len) == 1].reset_index(drop=True)
single_phonemes_train

Unnamed: 0,file,dialect,speaker,sentence_id,sentence,previous_packet,next_packet,packet_size,prev_curr,next_curr,packet_surrounding,phonemes,words
0,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,0,32,32,"(0, 32)","(32, 32)","(0, 32, 32)","(h#,)","(,)"
1,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,32,31,32,"(32, 32)","(31, 32)","(32, 32, 31)","(h#,)","(,)"
2,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,32,28,31,"(32, 31)","(28, 31)","(32, 31, 28)","(h#,)","(,)"
3,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,31,28,28,"(31, 28)","(28, 28)","(31, 28, 28)","(h#,)","(,)"
4,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,28,36,28,"(28, 28)","(36, 28)","(28, 28, 36)","(h#,)","(,)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
533700,DR8-MTCS0-SX82.CSV,DR8,MTCS0,SX82,Good service should be rewarded by big tips.,47,34,32,"(47, 32)","(34, 32)","(47, 32, 34)","(h#,)","(,)"
533701,DR8-MTCS0-SX82.CSV,DR8,MTCS0,SX82,Good service should be rewarded by big tips.,32,39,34,"(32, 34)","(39, 34)","(32, 34, 39)","(h#,)","(,)"
533702,DR8-MTCS0-SX82.CSV,DR8,MTCS0,SX82,Good service should be rewarded by big tips.,34,33,39,"(34, 39)","(33, 39)","(34, 39, 33)","(h#,)","(,)"
533703,DR8-MTCS0-SX82.CSV,DR8,MTCS0,SX82,Good service should be rewarded by big tips.,39,36,33,"(39, 33)","(36, 33)","(39, 33, 36)","(h#,)","(,)"


In [10]:
len(pd.unique(single_phonemes_train.phonemes))

62

In [11]:
pd.unique(single_phonemes_train.phonemes)

array([('h#',), ('sh',), ('ix',), ('hv',), ('eh',), ('jh',), ('ih',),
       ('dcl',), ('ah',), ('kcl',), ('k',), ('s',), ('ux',), ('q',),
       ('en',), ('r',), ('w',), ('ao',), ('axr',), ('l',), ('y',),
       ('uh',), ('n',), ('ae',), ('dx',), ('oy',), ('ax',), ('gcl',),
       ('dh',), ('tcl',), ('iy',), ('v',), ('t',), ('f',), ('ow',),
       ('d',), ('hh',), ('ch',), ('bcl',), ('aa',), ('em',), ('ng',),
       ('m',), ('ay',), ('th',), ('ax-h',), ('ey',), ('p',), ('pcl',),
       ('aw',), ('er',), ('z',), ('epi',), ('el',), ('uw',), ('g',),
       ('',), ('b',), ('pau',), ('zh',), ('nx',), ('eng',)], dtype=object)

Taken from: https://github.com/jhasegaw/phonecodes/blob/master/src/phonecode_tables.py

In [12]:
arpa_to_ipa = {
    'aa':'ɑ',
    'ae':'æ',
    'ah':'ʌ',
    'ah0':'ə',
    'ao':'ɔ',
    'aw':'aʊ',
    'ay':'aɪ',
    'eh':'ɛ',
    'er':'ɝ',
    'er0':'ɚ',
    'ey':'eɪ',
    'ih':'ɪ',
    'ih0':'ɨ',
    'iy':'i',
    'ow':'oʊ',
    'oy':'ɔɪ',
    'uh':'ʊ',
    'uw':'u',
    'b':'b',
    'ch':'tʃ',
    'd':'d',
    'dh':'ð',
    'el':'l̩',
    'em':'m̩',
    'en':'n̩',
    'f':'f',
    'g':'ɡ',
    'hh':'h',
    'jh':'dʒ',
    'k':'k',
    'l':'l',
    'm':'m',
    'n':'n',
    'ng':'ŋ',
    'p':'p',
    'q':'ʔ',
    'r':'ɹ',
    's':'s',
    'sh':'ʃ',
    't':'t',
    'th':'θ',
    'v':'v',
    'w':'w',
    'wh':'ʍ',
    'y':'j',
    'z':'z',
    'zh':'ʒ',

    'ax':'ə',
    'ax-h':'ə̥',
    'axr':'ɚ',
    'bcl':'b',
    'dcl':'d',
    'dx':'ɾ',
    'eng':'ŋ̍',
    'epi':'',
    'gcl':'g',
    'hv':'ɦ',
    'h#':'',
    'ix':'ɨ',
    'kcl':'k',
    'nx':'ɾ̃',
    'pau':'',
    'pcl':'p',
    'tcl':'t',
    'ux':'ʉ',
    '':'',
}

In [13]:
len(arpa_to_ipa)

66

This modification is based on this: https://en.wikipedia.org/wiki/ARPABET (+ minor guessing)

In [14]:
ipa_allophone = {
    'ŋ̍':'n', #should be ŋ
    'ə̥':'ɛ',
    'ɨ':'ɪ',
    'n̩':'n',
    'm̩':'m',
    'ŋ':'n',
    'ɾ̃':'n',
    'ð':'θ',
    'ʉ':'u',
    'ɾ':'d',
    'l̩':'l',
}

In [15]:
uniq_phon = pd.unique(single_phonemes_train.phonemes)
for i in range(len(uniq_phon)):
    uniq_phon[i] = arpa_to_ipa.get(uniq_phon[i][0], uniq_phon[i][0])
    uniq_phon[i] = ipa_allophone.get(uniq_phon[i], uniq_phon[i])
    
uniq_phon

array(['', 'ʃ', 'ɪ', 'ɦ', 'ɛ', 'dʒ', 'ɪ', 'd', 'ʌ', 'k', 'k', 's', 'u',
       'ʔ', 'n', 'ɹ', 'w', 'ɔ', 'ɚ', 'l', 'j', 'ʊ', 'n', 'æ', 'd', 'ɔɪ',
       'ə', 'g', 'θ', 't', 'i', 'v', 't', 'f', 'oʊ', 'd', 'h', 'tʃ', 'b',
       'ɑ', 'm', 'n', 'm', 'aɪ', 'θ', 'ɛ', 'eɪ', 'p', 'p', 'aʊ', 'ɝ', 'z',
       '', 'l', 'u', 'ɡ', '', 'b', '', 'ʒ', 'n', 'n'], dtype=object)

In [16]:
len(np.unique(uniq_phon))

43

Now modifying our input dataset:

In [17]:
#input is expected to be a tuple
def convert_phoneme(phoneme):
    tmp_1 = arpa_to_ipa.get(phoneme[0], phoneme[0])
    tmp_2 = ipa_allophone.get(tmp_1, tmp_1)
    return tmp_2

In [18]:
single_phonemes_train = skype_data_train.loc[skype_data_train.phonemes.apply(len) == 1].reset_index(drop=True)
single_phonemes_test = skype_data_test.loc[skype_data_test.phonemes.apply(len) == 1].reset_index(drop=True)
single_phonemes_test

Unnamed: 0,file,dialect,speaker,sentence_id,sentence,previous_packet,next_packet,packet_size,prev_curr,next_curr,packet_surrounding,phonemes,words
0,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,0,35,30,"(0, 30)","(35, 30)","(0, 30, 35)","(h#,)","(,)"
1,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,30,43,35,"(30, 35)","(43, 35)","(30, 35, 43)","(h#,)","(,)"
2,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,35,26,43,"(35, 43)","(26, 43)","(35, 43, 26)","(h#,)","(,)"
3,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,43,30,26,"(43, 26)","(30, 26)","(43, 26, 30)","(h#,)","(,)"
4,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,26,31,30,"(26, 30)","(31, 30)","(26, 30, 31)","(h#,)","(,)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
195608,DR8-MSLB0-SX383.CSV,DR8,MSLB0,SX383,The carpet cleaners shampooed our oriental rug.,40,43,46,"(40, 46)","(43, 46)","(40, 46, 43)","(h#,)","(,)"
195609,DR8-MSLB0-SX383.CSV,DR8,MSLB0,SX383,The carpet cleaners shampooed our oriental rug.,46,41,43,"(46, 43)","(41, 43)","(46, 43, 41)","(h#,)","(,)"
195610,DR8-MSLB0-SX383.CSV,DR8,MSLB0,SX383,The carpet cleaners shampooed our oriental rug.,43,34,41,"(43, 41)","(34, 41)","(43, 41, 34)","(h#,)","(,)"
195611,DR8-MSLB0-SX383.CSV,DR8,MSLB0,SX383,The carpet cleaners shampooed our oriental rug.,41,33,34,"(41, 34)","(33, 34)","(41, 34, 33)","(h#,)","(,)"


In [19]:
single_phonemes_test['phonemes'] = single_phonemes_test["phonemes"].apply(convert_phoneme)
single_phonemes_train['phonemes'] = single_phonemes_train["phonemes"].apply(convert_phoneme)
single_phonemes_train

Unnamed: 0,file,dialect,speaker,sentence_id,sentence,previous_packet,next_packet,packet_size,prev_curr,next_curr,packet_surrounding,phonemes,words
0,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,0,32,32,"(0, 32)","(32, 32)","(0, 32, 32)",,"(,)"
1,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,32,31,32,"(32, 32)","(31, 32)","(32, 32, 31)",,"(,)"
2,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,32,28,31,"(32, 31)","(28, 31)","(32, 31, 28)",,"(,)"
3,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,31,28,28,"(31, 28)","(28, 28)","(31, 28, 28)",,"(,)"
4,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,28,36,28,"(28, 28)","(36, 28)","(28, 28, 36)",,"(,)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
533700,DR8-MTCS0-SX82.CSV,DR8,MTCS0,SX82,Good service should be rewarded by big tips.,47,34,32,"(47, 32)","(34, 32)","(47, 32, 34)",,"(,)"
533701,DR8-MTCS0-SX82.CSV,DR8,MTCS0,SX82,Good service should be rewarded by big tips.,32,39,34,"(32, 34)","(39, 34)","(32, 34, 39)",,"(,)"
533702,DR8-MTCS0-SX82.CSV,DR8,MTCS0,SX82,Good service should be rewarded by big tips.,34,33,39,"(34, 39)","(33, 39)","(34, 39, 33)",,"(,)"
533703,DR8-MTCS0-SX82.CSV,DR8,MTCS0,SX82,Good service should be rewarded by big tips.,39,36,33,"(39, 33)","(36, 33)","(39, 33, 36)",,"(,)"


In [20]:
tmp = pd.unique(single_phonemes_test.phonemes)
print(len(tmp))
tmp

43


array(['', 'ʃ', 'i', 'ɦ', 'æ', 'd', 'ɝ', 'ɑ', 'ɹ', 'k', 's', 'u', 'ɪ',
       'n', 'g', 'ɡ', 'w', 'ʔ', 'ɔ', 'l', 'j', 'ɚ', 'oʊ', 't', 'ɛ', 'ɔɪ',
       'aɪ', 'θ', 'h', 'z', 'p', 'ə', 'b', 'f', 'v', 'm', 'aʊ', 'ʌ', 'eɪ',
       'tʃ', 'ʊ', 'dʒ', 'ʒ'], dtype=object)

In [21]:
train_set, train_labels = get_labels(single_phonemes_train, label=['phonemes'])
test_set, test_labels = get_labels(single_phonemes_test, label=['phonemes'])

train_labels = train_labels.astype('category')
test_labels = test_labels.astype('category')

total_labels = train_labels.append(test_labels)
print(len(pd.unique(train_labels.phonemes)))
print(len(pd.unique(test_labels.phonemes)))
total_unique_phonemes = len(pd.unique(total_labels.phonemes))
total_unique_phonemes

43
43


43

In [22]:
train_set, train_labels = get_labels(single_phonemes_train)
test_set, test_labels = get_labels(single_phonemes_test)

train_labels = train_labels.astype('category')
test_labels = test_labels.astype('category')

total_labels = train_labels.append(test_labels)
print(len(pd.unique(train_labels.words)))
print(len(pd.unique(test_labels.words)))
total_unique_words = len(pd.unique(total_labels.words))
total_unique_words

5104
2464


6387

### Merging long sequencess

In [23]:
tmp = single_phonemes_train[["file", "phonemes", "packet_surrounding"]][1:].reset_index(drop=True) != single_phonemes_train[["file", "phonemes", "packet_surrounding"]][:-1]
prev_not_same = (tmp.phonemes | tmp.packet_surrounding)
prev_not_same

0         True
1         True
2         True
3         True
4         True
          ... 
533699    True
533700    True
533701    True
533702    True
533703    True
Length: 533704, dtype: bool

In [24]:
prev_not_same.loc[-1] = True
prev_not_same.index = prev_not_same.index + 1
prev_not_same.sort_index(inplace=True)
prev_not_same

0         True
1         True
2         True
3         True
4         True
          ... 
533700    True
533701    True
533702    True
533703    True
533704    True
Length: 533705, dtype: bool

In [25]:
single_phonemes_train.loc[~prev_not_same.values]

Unnamed: 0,file,dialect,speaker,sentence_id,sentence,previous_packet,next_packet,packet_size,prev_curr,next_curr,packet_surrounding,phonemes,words
3542,DR1-FECD0-SI1418.CSV,DR1,FECD0,SI1418,Personal predispositions tend to blunt the ear...,39,39,39,"(39, 39)","(39, 39)","(39, 39, 39)",,"(,)"
5220,DR1-FETB0-SX248.CSV,DR1,FETB0,SX248,Reading in poor light gives you eyestrain.,33,33,33,"(33, 33)","(33, 33)","(33, 33, 33)",,"(,)"
6012,DR1-FJSP0-SI1763.CSV,DR1,FJSP0,SI1763,That's your headache.,42,42,42,"(42, 42)","(42, 42)","(42, 42, 42)",,"(,)"
6559,DR1-FJSP0-SX444.CSV,DR1,FJSP0,SX444,The toddler found a clamshell near the camp site.,42,42,42,"(42, 42)","(42, 42)","(42, 42, 42)",,"(,)"
8242,DR1-FMEM0-SA2.CSV,DR1,FMEM0,SA2,Don't ask me to carry an oily rag like that.,30,30,30,"(30, 30)","(30, 30)","(30, 30, 30)",,"(,)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
515430,DR8-FMBG0-SA1.CSV,DR8,FMBG0,SA1,She had your dark suit in greasy wash water al...,30,30,30,"(30, 30)","(30, 30)","(30, 30, 30)",,"(,)"
518926,DR8-MBCG0-SI2217.CSV,DR8,MBCG0,SI2217,"He'd not only told me so, he'd proved it.",63,63,63,"(63, 63)","(63, 63)","(63, 63, 63)",i,"(he'd,)"
524746,DR8-MKRG0-SX31.CSV,DR8,MKRG0,SX31,A good attitude is unbeatable.,38,38,38,"(38, 38)","(38, 38)","(38, 38, 38)",,"(,)"
525211,DR8-MMEA0-SI2018.CSV,DR8,MMEA0,SI2018,They were shattered.,27,27,27,"(27, 27)","(27, 27)","(27, 27, 27)",,"(,)"


This is going to only remove around 100 values, which is literally nothing => this won't help us.

In [20]:
def cv_dialect_splitter():
    for dialect in np.unique(skype_data_train.dialect):
        yield (skype_data_train.index[skype_data_train["dialect"] != dialect],
               skype_data_train.index[skype_data_train["dialect"] == dialect])

## Models:

In [23]:
import keras
import tensorflow as tf

from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import classification_report

In [24]:
single_phonemes_train = skype_data_train.loc[skype_data_train.phonemes.apply(len) == 1].reset_index(drop=True)
single_phonemes_test = skype_data_test.loc[skype_data_test.phonemes.apply(len) == 1].reset_index(drop=True)

single_phonemes_test['phonemes'] = single_phonemes_test["phonemes"].apply(convert_phoneme)
single_phonemes_train['phonemes'] = single_phonemes_train["phonemes"].apply(convert_phoneme)

In [25]:
train_set, train_labels = get_labels(single_phonemes_train, label=['phonemes'])
test_set, test_labels = get_labels(single_phonemes_test, label=['phonemes'])

train_labels, test_labels, encoder = prepare_labels(train_labels, test_labels, label=['phonemes'])

  return f(*args, **kwargs)


In [26]:
print(train_set.shape)
print(train_labels.shape)
train_labels

(533705, 3)
(533705,)


array([0, 0, 0, ..., 0, 0, 0])

In [27]:
encoder.inverse_transform(train_labels)

array(['', '', '', ..., '', '', ''], dtype=object)

In [28]:
train_labels = to_categorical(train_labels, num_classes=total_unique_phonemes)
test_labels = to_categorical(test_labels, num_classes=total_unique_phonemes)
print(train_labels.shape)

(533705, 43)


In [29]:
model = Sequential()

model.add(Dense(units=512, activation='relu', input_dim=3*1))  # first hidden layer
model.add(Dense(units=256, activation='relu'))  # second hidden layer
# model.add(Dense(units=128, activation='relu'))  # third hidden layer
model.add(Dense(units=total_unique_phonemes, activation='softmax'))  # output layer
# model.add(Dense(units=128))  # output layer

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               2048      
_________________________________________________________________
dense_1 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_2 (Dense)              (None, 43)                11051     
Total params: 144,427
Trainable params: 144,427
Non-trainable params: 0
_________________________________________________________________


In [30]:
model.fit(train_set, train_labels, epochs=64, batch_size=256)

Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64
Epoch 14/64
Epoch 15/64
Epoch 16/64
Epoch 17/64
Epoch 18/64
Epoch 19/64
Epoch 20/64
Epoch 21/64
Epoch 22/64
Epoch 23/64
Epoch 24/64
Epoch 25/64
Epoch 26/64
Epoch 27/64
Epoch 28/64
Epoch 29/64
Epoch 30/64
Epoch 31/64
Epoch 32/64
Epoch 33/64
Epoch 34/64
Epoch 35/64
Epoch 36/64
Epoch 37/64
Epoch 38/64
Epoch 39/64
Epoch 40/64
Epoch 41/64
Epoch 42/64
Epoch 43/64
Epoch 44/64
Epoch 45/64
Epoch 46/64
Epoch 47/64
Epoch 48/64
Epoch 49/64
Epoch 50/64
Epoch 51/64
Epoch 52/64
Epoch 53/64
Epoch 54/64
Epoch 55/64
Epoch 56/64
Epoch 57/64
Epoch 58/64
Epoch 59/64
Epoch 60/64
Epoch 61/64
Epoch 62/64
Epoch 63/64
Epoch 64/64


<tensorflow.python.keras.callbacks.History at 0x7f65894ffa90>

In [31]:
print("test loss, test acc:", model.evaluate(test_set, test_labels))

test loss, test acc: [2.7967820167541504, 0.24903763830661774]


In [32]:
print("train loss, train acc:", model.evaluate(train_set, train_labels))

train loss, train acc: [2.7707388401031494, 0.25507161021232605]


#### Looking into predictions

In [35]:
predictions = model.predict(train_set)
print(predictions[0])
np.argmax(predictions[0])

[1.00000000e+00 1.07542077e-15 3.50583390e-16 7.54777837e-12
 1.09861426e-11 4.91500946e-21 9.55265643e-18 4.03136536e-13
 3.27846626e-12 3.43993527e-19 3.20504403e-16 2.73501496e-17
 1.44555767e-09 1.49724753e-17 1.49449358e-14 2.32260337e-13
 6.07581117e-19 6.26654006e-11 3.93258141e-17 4.34822317e-10
 3.75515911e-17 4.56969722e-16 4.43366638e-16 2.51976388e-20
 3.75841050e-18 4.84780334e-15 8.54135319e-17 3.85227323e-15
 4.63416629e-20 4.69914887e-17 3.96586226e-16 3.11574293e-14
 1.31838031e-18 5.28811616e-16 1.06129007e-25 6.86194647e-15
 2.94472576e-18 7.81698707e-19 3.34474689e-28 2.98764469e-17
 1.85790032e-28 2.54921388e-12 3.53677157e-11]


0

In [66]:
predictions_array = [np.argmax(x) for x in predictions]
#encoder.inverse_transform(predictions_array).value_counts()
np.asarray((np.unique(encoder.inverse_transform(predictions_array), return_counts=True))).T

array([['', 145336],
       ['d', 5059],
       ['dʒ', 146],
       ['f', 28],
       ['i', 25854],
       ['k', 20251],
       ['n', 16699],
       ['p', 505],
       ['s', 174040],
       ['t', 2876],
       ['w', 1762],
       ['z', 7102],
       ['æ', 41961],
       ['ɑ', 160],
       ['ɛ', 41],
       ['ɦ', 57],
       ['ɪ', 82936],
       ['ɹ', 8414],
       ['ʔ', 221],
       ['θ', 257]], dtype=object)

In [37]:
comparison = single_phonemes_train.assign(predictions=encoder.inverse_transform(predictions_array))
compar = comparison[["phonemes", "predictions", "words"]]
compar

Unnamed: 0,phonemes,predictions,words
0,,,"(,)"
1,,,"(,)"
2,,,"(,)"
3,,,"(,)"
4,,,"(,)"
...,...,...,...
533700,,,"(,)"
533701,,,"(,)"
533702,,,"(,)"
533703,,,"(,)"


In [38]:
for phoneme in pd.unique(compar.phonemes):
    tmp = compar.loc[comparison.phonemes == phoneme]
    print("\n\nNow showing values for phoneme:", phoneme)
    print("Accuraccy:", len(tmp.loc[tmp.phonemes == tmp.predictions])/len(tmp))
    display(tmp.predictions.value_counts())



Now showing values for phoneme: 
Accuraccy: 0.9269229119299901


     86430
n     1366
s     1260
æ     1142
k     1014
ɪ      732
i      710
d      418
t       95
p       25
w       25
ɹ       10
θ        9
ʔ        8
Name: predictions, dtype: int64



Now showing values for phoneme: ʃ
Accuraccy: 0.0


s     8894
ɪ      868
z      182
æ      181
k      154
i      144
n      109
ɹ       56
        21
t       14
w        7
dʒ       1
d        1
ʔ        1
ɑ        1
Name: predictions, dtype: int64



Now showing values for phoneme: ɪ
Accuraccy: 0.2790738839522781


s     11864
ɪ      7883
æ      2759
i      2080
       1088
n       994
ɹ       638
z       399
k       329
w        83
d        51
t        41
ɑ        10
θ        10
dʒ        7
ɛ         5
ʔ         3
ɦ         3
Name: predictions, dtype: int64



Now showing values for phoneme: ɦ
Accuraccy: 0.0018214936247723133


s     1299
ɪ      503
z      457
ɹ      167
æ      143
i       64
        52
n       21
w       13
k       11
ɑ        8
ɦ        5
dʒ       1
d        1
Name: predictions, dtype: int64



Now showing values for phoneme: ɛ
Accuraccy: 0.00014378145219266715


s     6524
ɪ     3436
æ     1318
i     1033
       505
n      407
z      259
ɹ      242
k       89
w       42
t       20
d       16
ɑ        6
dʒ       4
ɦ        2
ɛ        2
ʔ        2
θ        2
f        1
Name: predictions, dtype: int64



Now showing values for phoneme: dʒ
Accuraccy: 0.003954305799648506


s     1165
k      363
ɪ      344
t       86
ɹ       76
z       74
æ       60
w       34
        27
i       21
n       11
dʒ       9
ʔ        4
θ        1
ɦ        1
Name: predictions, dtype: int64



Now showing values for phoneme: d
Accuraccy: 0.10005109862033724


      4927
k     1584
d      979
ɪ      596
s      477
n      445
t      236
i      195
æ      191
ɹ       67
w       25
p       18
ʔ       17
θ       12
z        8
ɑ        3
dʒ       3
ɛ        1
ɦ        1
Name: predictions, dtype: int64



Now showing values for phoneme: ʌ
Accuraccy: 0.0


s     3731
ɪ     2018
æ      730
i      520
z      239
       199
ɹ      198
n      148
k       66
w       25
d        8
ɑ        8
t        6
dʒ       5
ɦ        3
ʔ        3
f        2
Name: predictions, dtype: int64



Now showing values for phoneme: k
Accuraccy: 0.23513822490126793


      7874
k     4525
s     2293
ɪ     1455
æ      758
d      723
t      591
n      404
i      293
p      128
ɹ       80
w       78
ʔ       18
θ       10
dʒ       8
z        3
ɛ        2
ɑ        1
Name: predictions, dtype: int64



Now showing values for phoneme: s
Accuraccy: 0.7797886202403359


s     26930
ɪ      4171
æ      1165
i       640
z       484
k       389
n       309
ɹ       209
        125
t        59
w        34
dʒ        8
ɑ         3
θ         2
ɦ         2
d         2
ɛ         1
ʔ         1
f         1
Name: predictions, dtype: int64



Now showing values for phoneme: u
Accuraccy: 0.0


s    3228
ɪ    2434
æ    1748
     1337
i    1083
n     570
ɹ     161
k      59
z      48
w      32
d      17
θ      10
ʔ       5
t       4
ɛ       2
ɦ       1
f       1
Name: predictions, dtype: int64



Now showing values for phoneme: ʔ
Accuraccy: 0.003963830050786573


      2688
s     1105
ɪ     1023
k      904
æ      669
n      534
i      391
d      339
t      128
ɹ      113
w       77
ʔ       32
z       27
θ       25
p       11
ɑ        2
ɛ        2
f        2
dʒ       1
Name: predictions, dtype: int64



Now showing values for phoneme: n
Accuraccy: 0.11617207269381183


      4768
ɪ     3176
s     2880
n     2020
æ     1860
i     1798
d      325
k      274
ɹ      132
w       78
θ       31
t       19
z       12
ʔ        7
f        4
dʒ       3
ɑ        1
Name: predictions, dtype: int64



Now showing values for phoneme: ɹ
Accuraccy: 0.07102789569436022


s     5078
ɪ     3015
æ     1883
ɹ      937
       753
i      568
n      294
z      270
k      192
w      144
d       17
t       10
dʒ      10
ɑ        8
ʔ        7
ɦ        4
θ        2
Name: predictions, dtype: int64



Now showing values for phoneme: w
Accuraccy: 0.022445607270724317


s     1934
ɪ     1394
æ     1350
       836
ɹ      499
i      414
k      317
n      212
w      163
z       81
d       23
ʔ       12
t       12
dʒ       6
ɑ        5
θ        3
p        1
Name: predictions, dtype: int64



Now showing values for phoneme: ɔ
Accuraccy: 0.0


s     7278
ɪ     3469
æ     1734
i      888
       598
ɹ      416
n      350
z      327
k       95
w       40
t       12
ɑ        8
ɦ        4
dʒ       4
ʔ        4
d        4
θ        1
f        1
Name: predictions, dtype: int64



Now showing values for phoneme: ɚ
Accuraccy: 0.0


s     4110
ɪ     2542
æ     1535
i      806
       639
n      381
ɹ      207
z       75
k       68
w       13
d       12
ɑ        6
t        3
θ        2
ʔ        1
ɛ        1
dʒ       1
Name: predictions, dtype: int64



Now showing values for phoneme: l
Accuraccy: 0.0


s     5860
ɪ     3968
æ     1936
i      935
ɹ      663
       627
n      433
z      168
k      164
w       71
d       25
t       12
ɑ       10
θ        5
dʒ       4
ʔ        3
ɦ        2
ɛ        2
Name: predictions, dtype: int64



Now showing values for phoneme: j
Accuraccy: 0.0


s     1345
ɪ      852
æ      663
       345
i      331
n      204
k       83
ɹ       78
z       24
w       15
d       14
t        9
dʒ       2
ɑ        2
ʔ        1
p        1
Name: predictions, dtype: int64



Now showing values for phoneme: ʊ
Accuraccy: 0.0


s    709
ɪ    419
æ    119
i     93
z     42
      38
ɹ     35
n     32
k     11
ɑ      2
w      2
t      1
f      1
Name: predictions, dtype: int64



Now showing values for phoneme: æ
Accuraccy: 0.13338811284579566


s     10188
ɪ      5870
æ      3409
i      2019
       1827
n       925
ɹ       517
z       423
k       206
w        84
d        50
t        14
θ         7
ɑ         6
ɛ         3
ʔ         3
ɦ         3
dʒ        3
Name: predictions, dtype: int64



Now showing values for phoneme: ɔɪ
Accuraccy: 0.0


s     2580
ɪ     1076
æ      401
i      210
ɹ      191
z      169
        67
n       63
k       31
w       15
ɑ        6
t        4
dʒ       2
ɦ        1
d        1
Name: predictions, dtype: int64



Now showing values for phoneme: ə
Accuraccy: 0.0


s     2320
ɪ     1464
æ      432
i      239
ɹ      134
n      133
       131
z       98
k       90
w       18
t        9
d        8
ɑ        4
dʒ       3
ɦ        2
ɛ        1
Name: predictions, dtype: int64



Now showing values for phoneme: g
Accuraccy: 0.0


     1910
k     478
d     334
n     213
i      90
ɪ      78
s      54
æ      48
t      37
p      15
θ       4
ɹ       4
w       4
Name: predictions, dtype: int64



Now showing values for phoneme: θ
Accuraccy: 0.004584527220630372


      1561
s      944
ɪ      689
k      644
æ      383
n      325
i      240
d      169
t       75
w       70
ɹ       58
θ       24
ʔ       24
z       15
p        7
ɑ        3
dʒ       2
ɛ        1
f        1
Name: predictions, dtype: int64



Now showing values for phoneme: t
Accuraccy: 0.0508252703471827


      7858
k     3355
s     2311
ɪ     1174
t      893
d      637
æ      442
n      399
i      255
p      121
ɹ       54
w       35
dʒ      18
θ        6
ʔ        5
z        5
ɛ        2
Name: predictions, dtype: int64



Now showing values for phoneme: i
Accuraccy: 0.10164283665860953


s     8861
ɪ     5890
æ     2945
i     2481
      1876
n     1220
ɹ      441
z      357
k      152
w       85
d       51
θ       13
t       13
ɛ        5
ɦ        5
ɑ        4
f        4
ʔ        3
dʒ       3
Name: predictions, dtype: int64



Now showing values for phoneme: v
Accuraccy: 0.0


     971
s    831
ɪ    667
æ    372
i    371
n    365
k    101
d     97
ɹ     84
z     58
w     35
θ     10
ʔ      7
t      7
ɛ      4
ɑ      2
f      1
Name: predictions, dtype: int64



Now showing values for phoneme: f
Accuraccy: 0.00032747516646654295


s    3831
ɪ    1499
     1158
æ    1046
i     668
n     530
k     206
ɹ      56
d      52
z      51
w      35
θ      11
t       8
ʔ       3
ɛ       3
f       3
ɑ       1
Name: predictions, dtype: int64



Now showing values for phoneme: oʊ
Accuraccy: 0.0


s     5199
ɪ     2654
æ     1372
i      783
       469
ɹ      281
n      269
z      143
k       54
w       36
d        7
t        6
dʒ       4
ɑ        3
θ        2
f        1
ɦ        1
Name: predictions, dtype: int64



Now showing values for phoneme: h
Accuraccy: 0.0


s    947
ɪ    392
     235
æ    229
i    101
k     73
n     64
ɹ     33
z     30
w     11
θ      5
d      5
t      4
ɑ      2
Name: predictions, dtype: int64



Now showing values for phoneme: tʃ
Accuraccy: 0.0


s     1779
ɪ      366
k      253
t       77
æ       64
i       42
ɹ       40
        26
n       18
w       14
z        9
dʒ       5
ʔ        2
ɛ        1
Name: predictions, dtype: int64



Now showing values for phoneme: b
Accuraccy: 0.0


      2935
k      708
d      295
n       98
ɪ       45
s       38
i       29
p       29
t       22
æ       11
θ        5
ɹ        4
ʔ        3
w        3
dʒ       1
Name: predictions, dtype: int64



Now showing values for phoneme: ɑ
Accuraccy: 0.000319959045242209


s     7514
ɪ     3661
æ     1601
i      991
       531
ɹ      426
z      375
n      372
k       75
w       43
d       13
t       11
dʒ       7
ɑ        5
θ        1
ɦ        1
Name: predictions, dtype: int64



Now showing values for phoneme: m
Accuraccy: 0.0


      2115
ɪ     1474
s     1432
æ     1046
n      853
i      776
k      311
d      115
ɹ       88
w       65
θ       27
ʔ       18
t       15
z        3
dʒ       1
ɦ        1
Name: predictions, dtype: int64



Now showing values for phoneme: aɪ
Accuraccy: 0.0


s     7434
ɪ     3202
æ     1563
i      848
       661
n      347
z      328
ɹ      276
k       58
w       20
ɑ       17
d       11
t        9
dʒ       3
ɦ        2
ʔ        1
Name: predictions, dtype: int64



Now showing values for phoneme: eɪ
Accuraccy: 0.0


s     5052
ɪ     2743
æ     1531
i     1029
       857
n      445
z      225
ɹ      147
k       56
w       26
d       18
θ        7
ɑ        7
t        6
dʒ       6
f        4
ɛ        1
Name: predictions, dtype: int64



Now showing values for phoneme: p
Accuraccy: 0.015279785257072062


      5137
k     2094
s      608
ɪ      542
æ      317
t      253
d      180
p      148
n      138
i      120
w       69
ɹ       57
ʔ       12
θ        5
z        4
ɑ        1
dʒ       1
Name: predictions, dtype: int64



Now showing values for phoneme: aʊ
Accuraccy: 0.0


s    2300
ɪ    1080
æ     719
i     374
      303
n     162
ɹ      71
z      61
k      32
w       9
d       8
ɛ       2
t       2
f       1
Name: predictions, dtype: int64



Now showing values for phoneme: ɝ
Accuraccy: 0.0


s     3828
ɪ     2407
æ     1601
i      838
       515
n      312
ɹ      223
z       92
k       56
w       35
d       12
t        5
ɑ        4
dʒ       2
ɦ        1
Name: predictions, dtype: int64



Now showing values for phoneme: z
Accuraccy: 0.11672824991602285


s     7669
ɪ     1547
z     1390
æ      318
i      296
ɹ      225
n      177
k      161
        54
w       19
t       12
d       11
ɦ       11
ɑ       10
dʒ       6
ʔ        2
Name: predictions, dtype: int64



Now showing values for phoneme: ɡ
Accuraccy: 0.0


k     365
      261
æ     158
ɪ      75
s      54
i      40
t      36
w      30
n      24
ɹ      14
d      10
ʔ       9
θ       5
dʒ      2
p       1
Name: predictions, dtype: int64



Now showing values for phoneme: ʒ
Accuraccy: 0.0


s    302
z     87
ɪ     43
æ      9
i      7
ɹ      6
n      3
k      1
ɦ      1
ɑ      1
       1
Name: predictions, dtype: int64

Well, we can see, that our model works only on paper, because if we focus only on "important" predictions, the accuraccies are really low and almost nothing saying.

Adding scaler (as values should be in range (-1,1) or here (0,1))

In [39]:
from sklearn import preprocessing

scaler = preprocessing.MinMaxScaler()

train_set[["previous_packet", "next_packet", "packet_size"]] = scaler.fit_transform(train_set[["previous_packet", "next_packet", "packet_size"]])
test_set[["previous_packet", "next_packet", "packet_size"]] = scaler.transform(test_set[["previous_packet", "next_packet", "packet_size"]])
test_set

Unnamed: 0,previous_packet,packet_size,next_packet
0,0.000000,0.217391,0.343137
1,0.294118,0.271739,0.421569
2,0.343137,0.358696,0.254902
3,0.421569,0.173913,0.294118
4,0.254902,0.217391,0.303922
...,...,...,...
195608,0.392157,0.391304,0.421569
195609,0.450980,0.358696,0.401961
195610,0.421569,0.336957,0.333333
195611,0.401961,0.260870,0.323529


In [40]:
model = Sequential()

model.add(Dense(units=512, activation='relu', input_dim=3*1))  # first hidden layer
model.add(Dense(units=256, activation='relu'))  # second hidden layer
# model.add(Dense(units=128, activation='relu'))  # third hidden layer
model.add(Dense(units=total_unique_phonemes, activation='softmax'))  # output layer
# model.add(Dense(units=128))  # output layer

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 512)               2048      
_________________________________________________________________
dense_4 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_5 (Dense)              (None, 43)                11051     
Total params: 144,427
Trainable params: 144,427
Non-trainable params: 0
_________________________________________________________________


In [41]:
model.fit(train_set, train_labels, epochs=16, batch_size=256)

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


<tensorflow.python.keras.callbacks.History at 0x7f60d0a9d8d0>

In [42]:
print("test loss, test acc:", model.evaluate(test_set, test_labels))

test loss, test acc: [2.774493932723999, 0.2530199885368347]


#### Words

In [43]:
single_phonemes_train = skype_data_train.loc[skype_data_train.phonemes.apply(len) == 1].reset_index(drop=True)
single_phonemes_test = skype_data_test.loc[skype_data_test.phonemes.apply(len) == 1].reset_index(drop=True)

single_phonemes_test['phonemes'] = single_phonemes_test["phonemes"].apply(convert_phoneme)
single_phonemes_train['phonemes'] = single_phonemes_train["phonemes"].apply(convert_phoneme)

In [44]:
train_set, train_labels = get_labels(single_phonemes_train, label=['words'])
test_set, test_labels = get_labels(single_phonemes_test, label=['words'])

train_labels, test_labels, encoder = prepare_labels(train_labels, test_labels, label=['words'])

  return f(*args, **kwargs)


In [41]:
model = Sequential()

model.add(Dense(units=512, activation='relu', input_dim=3*1))  # first hidden layer
model.add(Dense(units=256, activation='relu'))  # second hidden layer
# model.add(Dense(units=128, activation='relu'))  # third hidden layer
model.add(Dense(units=total_unique_words, activation='softmax'))  # output layer
# model.add(Dense(units=128))  # output layer

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_9 (Dense)              (None, 512)               2048      
_________________________________________________________________
dense_10 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_11 (Dense)             (None, 6387)              1641459   
Total params: 1,774,835
Trainable params: 1,774,835
Non-trainable params: 0
_________________________________________________________________


In [45]:
train_labels = to_categorical(train_labels, num_classes=total_unique_words)
test_labels = to_categorical(test_labels, num_classes=total_unique_words)
print(train_labels.shape)

(533705, 6387)


In [46]:
model.fit(train_set, train_labels, epochs=64, batch_size=256)

Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64
Epoch 14/64
Epoch 15/64
Epoch 16/64
Epoch 17/64
Epoch 18/64
Epoch 19/64
Epoch 20/64
Epoch 21/64
Epoch 22/64
Epoch 23/64
Epoch 24/64
Epoch 25/64
Epoch 26/64
Epoch 27/64
Epoch 28/64
Epoch 29/64
Epoch 30/64
Epoch 31/64
Epoch 32/64
Epoch 33/64
Epoch 34/64
Epoch 35/64
Epoch 36/64
Epoch 37/64
Epoch 38/64
Epoch 39/64
Epoch 40/64
Epoch 41/64
Epoch 42/64
Epoch 43/64
Epoch 44/64
Epoch 45/64
Epoch 46/64
Epoch 47/64
Epoch 48/64
Epoch 49/64
Epoch 50/64
Epoch 51/64
Epoch 52/64
Epoch 53/64
Epoch 54/64
Epoch 55/64
Epoch 56/64
Epoch 57/64
Epoch 58/64
Epoch 59/64
Epoch 60/64
Epoch 61/64
Epoch 62/64
Epoch 63/64
Epoch 64/64


<tensorflow.python.keras.callbacks.History at 0x7f65895d2a90>

In [47]:
print("test loss, test acc:", model.evaluate(test_set, test_labels))

test loss, test acc: [10.04415512084961, 0.1853608936071396]


In [48]:
print("train loss, train acc:", model.evaluate(train_set, train_labels))

train loss, train acc: [5.892178535461426, 0.18834562599658966]


### KFold crossvalidation - FF_NN:

In [58]:
def create_model(output_size):
    model = Sequential()

    model.add(Dense(units=512, activation='relu', input_dim=3*1))  # first hidden layer
    model.add(Dense(units=256, activation='relu'))  # second hidden layer
    model.add(Dense(units=output_size, activation='softmax'))  # output layer

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    model.summary()
    
    return model

In [59]:
single_phonemes_train = skype_data_train.loc[skype_data_train.phonemes.apply(len) == 1].reset_index(drop=True)
single_phonemes_test = skype_data_test.loc[skype_data_test.phonemes.apply(len) == 1].reset_index(drop=True)

single_phonemes_test['phonemes'] = single_phonemes_test["phonemes"].apply(convert_phoneme)
single_phonemes_train['phonemes'] = single_phonemes_train["phonemes"].apply(convert_phoneme)

In [60]:
train_set, train_labels = get_labels(single_phonemes_train, label=['phonemes'])
test_set, test_labels = get_labels(single_phonemes_test, label=['phonemes'])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels, label=['phonemes'])

  return f(*args, **kwargs)


In [61]:
from sklearn import preprocessing

scaler = preprocessing.MinMaxScaler()

train_set[["previous_packet", "next_packet", "packet_size"]] = scaler.fit_transform(train_set[["previous_packet", "next_packet", "packet_size"]])
test_set[["previous_packet", "next_packet", "packet_size"]] = scaler.transform(test_set[["previous_packet", "next_packet", "packet_size"]])

In [62]:
from keras.utils import to_categorical

train_labels = to_categorical(train_labels, num_classes=total_unique_phonemes)
test_labels = to_categorical(test_labels, num_classes=total_unique_phonemes)
print(train_labels.shape)

(533705, 43)


In [63]:
dialects = pd.unique(skype_data_train.dialect)

results = []

for dialect in dialects:
    print("\n\nNow validating on dialect:", dialect)
    
    set_train = train_set.loc[single_phonemes_train["dialect"] != dialect]
    label_train = train_labels[single_phonemes_train["dialect"] != dialect]
    
    validation_set = train_set.loc[single_phonemes_train["dialect"] == dialect]
    validation_labels = train_labels[single_phonemes_train["dialect"] == dialect]
    
    model = create_model(total_unique_phonemes)
    
    display(model.fit(set_train, label_train, epochs=32, batch_size=256))
    
    result = model.evaluate(validation_set, validation_labels)
    results.extend([result]) # remake to a dictionary
    
    print("test loss, test acc:", result)
print("\nDone!")

print(results)



Now validating on dialect: DR1
Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_10 (Dense)             (None, 512)               2048      
_________________________________________________________________
dense_11 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_12 (Dense)             (None, 43)                11051     
Total params: 144,427
Trainable params: 144,427
Non-trainable params: 0
_________________________________________________________________
Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32
Epoch 6/32
Epoch 7/32
Epoch 8/32
Epoch 9/32
Epoch 10/32
Epoch 11/32
Epoch 12/32
Epoch 13/32
Epoch 14/32
Epoch 15/32
Epoch 16/32
Epoch 17/32
Epoch 18/32
Epoch 19/32
Epoch 20/32
Epoch 21/32
Epoch 22/32
Epoch 23/32
Epoch 24/32
Epoch 25/32
Epoch 26/32
Epoch 27/32
Epoch 28/32
Epoch 29/32
Epoch 

<tensorflow.python.keras.callbacks.History at 0x7f60d0f374a8>

test loss, test acc: [2.7368481159210205, 0.26732170581817627]


Now validating on dialect: DR2
Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_13 (Dense)             (None, 512)               2048      
_________________________________________________________________
dense_14 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_15 (Dense)             (None, 43)                11051     
Total params: 144,427
Trainable params: 144,427
Non-trainable params: 0
_________________________________________________________________
Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32
Epoch 6/32
Epoch 7/32
Epoch 8/32
Epoch 9/32
Epoch 10/32
Epoch 11/32
Epoch 12/32
Epoch 13/32
Epoch 14/32
Epoch 15/32
Epoch 16/32
Epoch 17/32
Epoch 18/32
Epoch 19/32
Epoch 20/32
Epoch 21/32
Epoch 22/32
Epoch 23/32
Epoch 24/32
Epo

<tensorflow.python.keras.callbacks.History at 0x7f60d0fcba58>

test loss, test acc: [2.739569664001465, 0.25986436009407043]


Now validating on dialect: DR3
Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_16 (Dense)             (None, 512)               2048      
_________________________________________________________________
dense_17 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_18 (Dense)             (None, 43)                11051     
Total params: 144,427
Trainable params: 144,427
Non-trainable params: 0
_________________________________________________________________
Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32
Epoch 6/32
Epoch 7/32
Epoch 8/32
Epoch 9/32
Epoch 10/32
Epoch 11/32
Epoch 12/32
Epoch 13/32
Epoch 14/32
Epoch 15/32
Epoch 16/32
Epoch 17/32
Epoch 18/32
Epoch 19/32
Epoch 20/32
Epoch 21/32
Epoch 22/32
Epoch 23/32
Epoch 24/32
Epoc

<tensorflow.python.keras.callbacks.History at 0x7f60d0fd7860>

test loss, test acc: [2.7429392337799072, 0.25973647832870483]


Now validating on dialect: DR4
Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_19 (Dense)             (None, 512)               2048      
_________________________________________________________________
dense_20 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_21 (Dense)             (None, 43)                11051     
Total params: 144,427
Trainable params: 144,427
Non-trainable params: 0
_________________________________________________________________
Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32
Epoch 6/32
Epoch 7/32
Epoch 8/32
Epoch 9/32
Epoch 10/32
Epoch 11/32
Epoch 12/32
Epoch 13/32
Epoch 14/32
Epoch 15/32
Epoch 16/32
Epoch 17/32
Epoch 18/32
Epoch 19/32
Epoch 20/32
Epoch 21/32
Epoch 22/32
Epoch 23/32
Epoch 24/32
Epo

<tensorflow.python.keras.callbacks.History at 0x7f60d106b780>

test loss, test acc: [2.787505626678467, 0.2458427995443344]


Now validating on dialect: DR5
Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_22 (Dense)             (None, 512)               2048      
_________________________________________________________________
dense_23 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_24 (Dense)             (None, 43)                11051     
Total params: 144,427
Trainable params: 144,427
Non-trainable params: 0
_________________________________________________________________
Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32
Epoch 6/32
Epoch 7/32
Epoch 8/32
Epoch 9/32
Epoch 10/32
Epoch 11/32
Epoch 12/32
Epoch 13/32
Epoch 14/32
Epoch 15/32
Epoch 16/32
Epoch 17/32
Epoch 18/32
Epoch 19/32
Epoch 20/32
Epoch 21/32
Epoch 22/32
Epoch 23/32
Epoch 24/32
Epoch

<tensorflow.python.keras.callbacks.History at 0x7f60d10f9ac8>

test loss, test acc: [2.7651100158691406, 0.25588634610176086]


Now validating on dialect: DR6
Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_25 (Dense)             (None, 512)               2048      
_________________________________________________________________
dense_26 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_27 (Dense)             (None, 43)                11051     
Total params: 144,427
Trainable params: 144,427
Non-trainable params: 0
_________________________________________________________________
Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32
Epoch 6/32
Epoch 7/32
Epoch 8/32
Epoch 9/32
Epoch 10/32
Epoch 11/32
Epoch 12/32
Epoch 13/32
Epoch 14/32
Epoch 15/32
Epoch 16/32
Epoch 17/32
Epoch 18/32
Epoch 19/32
Epoch 20/32
Epoch 21/32
Epoch 22/32
Epoch 23/32
Epoch 24/32
Epo

<tensorflow.python.keras.callbacks.History at 0x7f60d10759e8>

test loss, test acc: [2.7461373805999756, 0.2648336887359619]


Now validating on dialect: DR7
Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_28 (Dense)             (None, 512)               2048      
_________________________________________________________________
dense_29 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_30 (Dense)             (None, 43)                11051     
Total params: 144,427
Trainable params: 144,427
Non-trainable params: 0
_________________________________________________________________
Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32
Epoch 6/32
Epoch 7/32
Epoch 8/32
Epoch 9/32
Epoch 10/32
Epoch 11/32
Epoch 12/32
Epoch 13/32
Epoch 14/32
Epoch 15/32
Epoch 16/32
Epoch 17/32
Epoch 18/32
Epoch 19/32
Epoch 20/32
Epoch 21/32
Epoch 22/32
Epoch 23/32
Epoch 24/32
Epo

<tensorflow.python.keras.callbacks.History at 0x7f60d103f748>

test loss, test acc: [2.724114418029785, 0.26456424593925476]


Now validating on dialect: DR8
Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_31 (Dense)             (None, 512)               2048      
_________________________________________________________________
dense_32 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_33 (Dense)             (None, 43)                11051     
Total params: 144,427
Trainable params: 144,427
Non-trainable params: 0
_________________________________________________________________
Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32
Epoch 6/32
Epoch 7/32
Epoch 8/32
Epoch 9/32
Epoch 10/32
Epoch 11/32
Epoch 12/32
Epoch 13/32
Epoch 14/32
Epoch 15/32
Epoch 16/32
Epoch 17/32
Epoch 18/32
Epoch 19/32
Epoch 20/32
Epoch 21/32
Epoch 22/32
Epoch 23/32
Epoch 24/32
Epo

<tensorflow.python.keras.callbacks.History at 0x7f60d0fbf3c8>

test loss, test acc: [2.7261714935302734, 0.2665202021598816]

Done!
[[2.7368481159210205, 0.26732170581817627], [2.739569664001465, 0.25986436009407043], [2.7429392337799072, 0.25973647832870483], [2.787505626678467, 0.2458427995443344], [2.7651100158691406, 0.25588634610176086], [2.7461373805999756, 0.2648336887359619], [2.724114418029785, 0.26456424593925476], [2.7261714935302734, 0.2665202021598816]]


### LSTM

In [24]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

In [50]:
# model_lstm = Sequential()
# model_lstm.add(LSTM(256, input_shape = (1, 3)))
# model_lstm.add(Dense(units=total_unique_words))
# model_lstm.compile(loss='categorical_crossentropy',
#               optimizer='adam',
#               metrics=['accuracy']
#              )

# model_lstm.summary()

In [25]:
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers import SpatialDropout1D

In [46]:
#more elaborate model
model_lstm = Sequential()

#model_lstm.add(Embedding(input_dim = 3, output_dim = 2, input_length = 86497))
#model_lstm.add(SpatialDropout1D(0.3))
model_lstm.add(LSTM(256, input_shape = (1, 3), dropout = 0.3, recurrent_dropout = 0.3))
model_lstm.add(Dense(256, activation = 'relu'))
model_lstm.add(Dropout(0.3))
model_lstm.add(Dense(total_unique_phonemes, activation = 'softmax'))

model_lstm.compile(
    loss='categorical_crossentropy',
    optimizer='Adam',
    metrics=['accuracy']
)

model_lstm.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 256)               266240    
_________________________________________________________________
dense_6 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 43)                11051     
Total params: 343,083
Trainable params: 343,083
Non-trainable params: 0
_________________________________________________________________


In [26]:
single_phonemes_train = skype_data_train.loc[skype_data_train.phonemes.apply(len) == 1].reset_index(drop=True)
single_phonemes_test = skype_data_test.loc[skype_data_test.phonemes.apply(len) == 1].reset_index(drop=True)

single_phonemes_test['phonemes'] = single_phonemes_test["phonemes"].apply(convert_phoneme)
single_phonemes_train['phonemes'] = single_phonemes_train["phonemes"].apply(convert_phoneme)

In [27]:
train_set, train_labels = get_labels(single_phonemes_train, label=['phonemes'])
test_set, test_labels = get_labels(single_phonemes_test, label=['phonemes'])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels, label=['phonemes'])

  return f(*args, **kwargs)


In [28]:
from keras.utils import to_categorical

train_labels = to_categorical(train_labels, num_classes=total_unique_phonemes)
test_labels = to_categorical(test_labels, num_classes=total_unique_phonemes)
print(train_labels.shape)

(533705, 43)


In [50]:
reshaped_values = train_set.values.reshape(-1, 1, 3)
reshaped_values[0][0]

array([ 0, 32, 32])

In [51]:
model_lstm.fit(reshaped_values, train_labels, epochs=16, batch_size=128)

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


<tensorflow.python.keras.callbacks.History at 0x7f60d0b8e898>

In [52]:
print("test loss, test acc:", model_lstm.evaluate(test_set.values.reshape(-1, 1, 3), test_labels))

test loss, test acc: [4.309224605560303, 0.17015229165554047]


In [29]:
from sklearn import preprocessing

scaler = preprocessing.MinMaxScaler()

#train_set[["previous_packet", "next_packet", "packet_size"]] = scaler.fit_transform(train_set[["previous_packet", "next_packet", "packet_size"]])
#test_set[["previous_packet", "next_packet", "packet_size"]] = scaler.transform(test_set[["previous_packet", "next_packet", "packet_size"]])

train_set = train_set["packet_size"]
test_set = test_set["packet_size"]
train_set

0         32
1         32
2         31
3         28
4         28
          ..
533700    32
533701    34
533702    39
533703    33
533704    36
Name: packet_size, Length: 533705, dtype: int64

In [30]:
#more elaborate model
model_lstm = Sequential()

#model_lstm.add(Embedding(input_dim = 3, output_dim = 2, input_length = 86497))
#model_lstm.add(SpatialDropout1D(0.3))
model_lstm.add(LSTM(256, input_shape = (1, 1), dropout = 0.3, recurrent_dropout = 0.3))
model_lstm.add(Dense(256, activation = 'relu'))
model_lstm.add(Dropout(0.3))
model_lstm.add(Dense(total_unique_phonemes, activation = 'softmax'))

model_lstm.compile(
    loss='categorical_crossentropy',
    optimizer='Adam',
    metrics=['accuracy']
)

model_lstm.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 256)               264192    
_________________________________________________________________
dense (Dense)                (None, 256)               65792     
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 43)                11051     
Total params: 341,035
Trainable params: 341,035
Non-trainable params: 0
_________________________________________________________________


In [31]:
reshaped_values = train_set.values.reshape(-1, 1, 1)
#reshaped_values = train_set
reshaped_values[0][0]

array([32])

In [32]:
model_lstm.fit(reshaped_values, train_labels, epochs=64, batch_size=256)

Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64
Epoch 14/64
Epoch 15/64
Epoch 16/64
Epoch 17/64
Epoch 18/64
Epoch 19/64
Epoch 20/64
Epoch 21/64
Epoch 22/64
Epoch 23/64
Epoch 24/64
Epoch 25/64
Epoch 26/64
Epoch 27/64
Epoch 28/64
Epoch 29/64
Epoch 30/64
Epoch 31/64
Epoch 32/64
Epoch 33/64
Epoch 34/64
Epoch 35/64
Epoch 36/64
Epoch 37/64
Epoch 38/64
Epoch 39/64
Epoch 40/64
Epoch 41/64
Epoch 42/64
Epoch 43/64
Epoch 44/64
Epoch 45/64
Epoch 46/64
Epoch 47/64
Epoch 48/64
Epoch 49/64
Epoch 50/64
Epoch 51/64
Epoch 52/64
Epoch 53/64
Epoch 54/64
Epoch 55/64
Epoch 56/64
Epoch 57/64
Epoch 58/64
Epoch 59/64
Epoch 60/64
Epoch 61/64
Epoch 62/64
Epoch 63/64
Epoch 64/64


<tensorflow.python.keras.callbacks.History at 0x7f450da574e0>

In [33]:
print("test loss, test acc:", model_lstm.evaluate(test_set.values.reshape(-1, 1, 1), test_labels))

test loss, test acc: [4.382974147796631, 0.13401971757411957]


#### Words

In [52]:
model_lstm = Sequential()

#model_lstm.add(Embedding(input_dim = 3, output_dim = 2, input_length = 86497))
#model_lstm.add(SpatialDropout1D(0.3))
model_lstm.add(LSTM(256, input_shape = (1, 3), dropout = 0.3, recurrent_dropout = 0.3))
model_lstm.add(Dense(256, activation = 'relu'))
model_lstm.add(Dropout(0.3))
model_lstm.add(Dense(total_unique_words, activation = 'softmax'))

model_lstm.compile(
    loss='categorical_crossentropy',
    optimizer='Adam',
    metrics=['accuracy']
)

model_lstm.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 256)               266240    
_________________________________________________________________
dense_12 (Dense)             (None, 256)               65792     
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 6387)              1641459   
Total params: 1,973,491
Trainable params: 1,973,491
Non-trainable params: 0
_________________________________________________________________


In [53]:
single_phonemes_train = skype_data_train.loc[skype_data_train.phonemes.apply(len) == 1].reset_index(drop=True)
single_phonemes_test = skype_data_test.loc[skype_data_test.phonemes.apply(len) == 1].reset_index(drop=True)

single_phonemes_test['phonemes'] = single_phonemes_test["phonemes"].apply(convert_phoneme)
single_phonemes_train['phonemes'] = single_phonemes_train["phonemes"].apply(convert_phoneme)

In [54]:
train_set, train_labels = get_labels(single_phonemes_train, label=['words'])
test_set, test_labels = get_labels(single_phonemes_test, label=['words'])

train_labels, test_labels, encoder = prepare_labels(train_labels, test_labels, label=['words'])

  return f(*args, **kwargs)


In [55]:
from keras.utils import to_categorical

train_labels = to_categorical(train_labels, num_classes=total_unique_words)
test_labels = to_categorical(test_labels, num_classes=total_unique_words)
print(train_labels.shape)

(533705, 6387)


In [56]:
reshaped_values = train_set.values.reshape(-1, 1, 3)
reshaped_values[0][0]

array([ 0, 32, 32])

In [57]:
model_lstm.fit(reshaped_values, train_labels, epochs=64, batch_size=256)

Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64
Epoch 14/64
Epoch 15/64
Epoch 16/64
Epoch 17/64
Epoch 18/64
Epoch 19/64
Epoch 20/64
Epoch 21/64
Epoch 22/64
Epoch 23/64
Epoch 24/64
Epoch 25/64
Epoch 26/64
Epoch 27/64
Epoch 28/64
Epoch 29/64
Epoch 30/64
Epoch 31/64
Epoch 32/64
Epoch 33/64
Epoch 34/64
Epoch 35/64
Epoch 36/64
Epoch 37/64
Epoch 38/64
Epoch 39/64
Epoch 40/64
Epoch 41/64
Epoch 42/64
Epoch 43/64
Epoch 44/64
Epoch 45/64
Epoch 46/64
Epoch 47/64
Epoch 48/64
Epoch 49/64
Epoch 50/64
Epoch 51/64
Epoch 52/64
Epoch 53/64
Epoch 54/64
Epoch 55/64
Epoch 56/64
Epoch 57/64
Epoch 58/64
Epoch 59/64
Epoch 60/64
Epoch 61/64
Epoch 62/64
Epoch 63/64
Epoch 64/64


<tensorflow.python.keras.callbacks.History at 0x7f658964bbe0>

In [58]:
print("test loss, test acc:", model_lstm.evaluate(test_set.values.reshape(-1, 1, 3), test_labels))

test loss, test acc: [10.582669258117676, 0.17642487585544586]


### Decision Tree

In [59]:
from sklearn.tree import DecisionTreeClassifier

In [60]:
single_phonemes_train = skype_data_train.loc[skype_data_train.phonemes.apply(len) == 1].reset_index(drop=True)
single_phonemes_test = skype_data_test.loc[skype_data_test.phonemes.apply(len) == 1].reset_index(drop=True)

single_phonemes_test['phonemes'] = single_phonemes_test["phonemes"].apply(convert_phoneme)
single_phonemes_train['phonemes'] = single_phonemes_train["phonemes"].apply(convert_phoneme)

In [61]:
train_set, train_labels = get_labels(single_phonemes_train, label=['phonemes'])
test_set, test_labels = get_labels(single_phonemes_test, label=['phonemes'])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels, label=['phonemes'])

  return f(*args, **kwargs)


In [62]:
tree_clf_pipeline = Pipeline(
    [
        (
            "clf",
            DecisionTreeClassifier(criterion="entropy", max_depth=None, splitter="best",
                                   min_samples_split=2, random_state=42),
        ),
    ]
)

In [63]:
print("Starting!")
tree_clf_pipeline.fit(train_set, train_labels)
print("Finished!")

print(f"Train accuracy: {tree_clf_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {tree_clf_pipeline.score(test_set, test_labels):.4f}")

Starting!
Finished!
Train accuracy: 0.3999
Test accuracy : 0.2136


#### Words

In [64]:
train_set, train_labels = get_labels(single_phonemes_train, label=['words'])
test_set, test_labels = get_labels(single_phonemes_test, label=['words'])

train_labels, test_labels, encoder = prepare_labels(train_labels, test_labels, label=['words'])

  return f(*args, **kwargs)


In [65]:
tree_clf_pipeline = Pipeline(
    [
        (
            "clf",
            DecisionTreeClassifier(criterion="entropy", max_depth=None, splitter="best",
                                   min_samples_split=2, random_state=42),
        ),
    ]
)

In [66]:
print("Starting!")
tree_clf_pipeline.fit(train_set, train_labels)
print("Finished!")

print(f"Train accuracy: {tree_clf_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {tree_clf_pipeline.score(test_set, test_labels):.4f}")

Starting!
Finished!
Train accuracy: 0.3337
Test accuracy : 0.1575


### KNN

In [75]:
from sklearn.neighbors import KNeighborsClassifier

In [76]:
single_phonemes_train = skype_data_train.loc[skype_data_train.phonemes.apply(len) == 1].reset_index(drop=True)
single_phonemes_test = skype_data_test.loc[skype_data_test.phonemes.apply(len) == 1].reset_index(drop=True)

single_phonemes_test['phonemes'] = single_phonemes_test["phonemes"].apply(convert_phoneme)
single_phonemes_train['phonemes'] = single_phonemes_train["phonemes"].apply(convert_phoneme)

In [77]:
train_set, train_labels = get_labels(single_phonemes_train, label=['phonemes'])
test_set, test_labels = get_labels(single_phonemes_test, label=['phonemes'])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels, label=['phonemes'])

  return f(*args, **kwargs)


In [32]:
parameters = {'n_neighbors':[16,32], 'weights':['uniform', 'distance'], 'n_jobs':[-1]}

In [33]:
orig_clf = KNeighborsClassifier()
gscv_clf = GridSearchCV(orig_clf, parameters, n_jobs = -1, cv=cv_dialect_splitter())

In [34]:
print("Starting!")
gscv_clf.fit(train_set, train_labels)
print("Finished!")

print("Best: %f using %s" % (gscv_clf.best_score_, gscv_clf.best_params_))
print(f"Train accuracy: {gscv_clf.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {gscv_clf.score(test_set, test_labels):.4f}")

Starting!
Finished!
Best: 0.136635 using {'n_jobs': -1, 'n_neighbors': 32, 'weights': 'uniform'}
Train accuracy: 0.1700
Test accuracy : 0.1343


In [81]:
knn_clf_pipeline = Pipeline(
    [
        (
            "clf",
            KNeighborsClassifier(32, weights='distance', n_jobs=-1)
        ),
    ]
)

In [82]:
print("Starting!")
knn_clf_pipeline.fit(train_set, train_labels)
print("Finished!")

print(f"Train accuracy: {knn_clf_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {knn_clf_pipeline.score(test_set, test_labels):.4f}")

Starting!
Finished!
Train accuracy: 0.3990
Test accuracy : 0.2178


#### Words

In [83]:
train_set, train_labels = get_labels(single_phonemes_train, label=['words'])
test_set, test_labels = get_labels(single_phonemes_test, label=['words'])

train_labels, test_labels, encoder = prepare_labels(train_labels, test_labels, label=['words'])

  return f(*args, **kwargs)


In [32]:
parameters = {'n_neighbors':[16,32], 'weights':['uniform', 'distance'], 'n_jobs':[-1]}

In [33]:
orig_clf = KNeighborsClassifier()
gscv_clf = GridSearchCV(orig_clf, parameters, n_jobs = -1, cv=cv_dialect_splitter())

In [34]:
print("Starting!")
gscv_clf.fit(train_set, train_labels)
print("Finished!")

print("Best: %f using %s" % (gscv_clf.best_score_, gscv_clf.best_params_))
print(f"Train accuracy: {gscv_clf.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {gscv_clf.score(test_set, test_labels):.4f}")

Starting!
Finished!
Best: 0.136635 using {'n_jobs': -1, 'n_neighbors': 32, 'weights': 'uniform'}
Train accuracy: 0.1700
Test accuracy : 0.1343


In [84]:
knn_clf_pipeline = Pipeline(
    [
        (
            "clf",
            KNeighborsClassifier(32, weights='distance', n_jobs=-1)
        ),
    ]
)

In [85]:
print("Starting!")
knn_clf_pipeline.fit(train_set, train_labels)
print("Finished!")

print(f"Train accuracy: {knn_clf_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {knn_clf_pipeline.score(test_set, test_labels):.4f}")

Starting!
Finished!
Train accuracy: 0.3328
Test accuracy : 0.1615


### Exploring the 2 most spoken sentences

In [23]:
sentence_1 = "She had your dark suit in greasy wash water all year."
sentence_2 = "Don't ask me to carry an oily rag like that."

In [24]:
two_sentence_train = single_phonemes_train.loc[single_phonemes_train["sentence"].isin([sentence_1, sentence_2])]
two_sentence_train.reset_index(inplace=True)
two_sentence_train

Unnamed: 0,index,file,dialect,speaker,sentence_id,sentence,previous_packet,next_packet,packet_size,prev_curr,next_curr,packet_surrounding,phonemes,words
0,0,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,0,32,32,"(0, 32)","(32, 32)","(0, 32, 32)",,"(,)"
1,1,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,32,31,32,"(32, 32)","(31, 32)","(32, 32, 31)",,"(,)"
2,2,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,32,28,31,"(32, 31)","(28, 31)","(32, 31, 28)",,"(,)"
3,3,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,31,28,28,"(31, 28)","(28, 28)","(31, 28, 28)",,"(,)"
4,4,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,28,36,28,"(28, 28)","(36, 28)","(28, 28, 36)",,"(,)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109467,532803,DR8-MTCS0-SA2.CSV,DR8,MTCS0,SA2,Don't ask me to carry an oily rag like that.,40,27,30,"(40, 30)","(27, 30)","(40, 30, 27)",t,"(that,)"
109468,532804,DR8-MTCS0-SA2.CSV,DR8,MTCS0,SA2,Don't ask me to carry an oily rag like that.,30,47,27,"(30, 27)","(47, 27)","(30, 27, 47)",t,"(that,)"
109469,532805,DR8-MTCS0-SA2.CSV,DR8,MTCS0,SA2,Don't ask me to carry an oily rag like that.,47,40,49,"(47, 49)","(40, 49)","(47, 49, 40)",,"(,)"
109470,532806,DR8-MTCS0-SA2.CSV,DR8,MTCS0,SA2,Don't ask me to carry an oily rag like that.,49,50,40,"(49, 40)","(50, 40)","(49, 40, 50)",,"(,)"


In [25]:
two_sentence_test = single_phonemes_test.loc[single_phonemes_test["sentence"].isin([sentence_1, sentence_2])]
two_sentence_test.reset_index(inplace=True, drop=True)
two_sentence_test

Unnamed: 0,file,dialect,speaker,sentence_id,sentence,previous_packet,next_packet,packet_size,prev_curr,next_curr,packet_surrounding,phonemes,words
0,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,0,35,30,"(0, 30)","(35, 30)","(0, 30, 35)",,"(,)"
1,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,30,43,35,"(30, 35)","(43, 35)","(30, 35, 43)",,"(,)"
2,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,35,26,43,"(35, 43)","(26, 43)","(35, 43, 26)",,"(,)"
3,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,43,30,26,"(43, 26)","(30, 26)","(43, 26, 30)",,"(,)"
4,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,26,31,30,"(26, 30)","(31, 30)","(26, 30, 31)",,"(,)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
40128,DR8-MSLB0-SA2.CSV,DR8,MSLB0,SA2,Don't ask me to carry an oily rag like that.,62,47,75,"(62, 75)","(47, 75)","(62, 75, 47)",æ,"(that,)"
40129,DR8-MSLB0-SA2.CSV,DR8,MSLB0,SA2,Don't ask me to carry an oily rag like that.,47,36,32,"(47, 32)","(36, 32)","(47, 32, 36)",t,"(that,)"
40130,DR8-MSLB0-SA2.CSV,DR8,MSLB0,SA2,Don't ask me to carry an oily rag like that.,32,27,36,"(32, 36)","(27, 36)","(32, 36, 27)",t,"(that,)"
40131,DR8-MSLB0-SA2.CSV,DR8,MSLB0,SA2,Don't ask me to carry an oily rag like that.,27,24,26,"(27, 26)","(24, 26)","(27, 26, 24)",,"(,)"


In [26]:
import keras
import tensorflow as tf

from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import classification_report

In [27]:
train_set, train_labels = get_labels(two_sentence_train, label=["words"])
test_set, test_labels = get_labels(two_sentence_test, label=["words"])

In [28]:
total_labels_2 = train_labels.append(test_labels)
print(len(pd.unique(train_labels.words)))
print(len(pd.unique(test_labels.words)))
total_unique_words_2 = len(pd.unique(total_labels_2.words))
total_unique_words_2

24
24


24

In [29]:
train_labels, test_labels, _ = prepare_labels(train_labels, test_labels)

  return f(*args, **kwargs)


In [30]:
train_labels = to_categorical(train_labels, num_classes=total_unique_words_2)
test_labels = to_categorical(test_labels, num_classes=total_unique_words_2)
print(train_labels.shape)

(109472, 24)


#### BE CAREFUL ABOUT TOTAL WORDS

In [31]:
model = Sequential()

model.add(Dense(units=512, activation='relu', input_dim=3*1))  # first hidden layer
model.add(Dense(units=256, activation='relu'))  # second hidden layer
# model.add(Dense(units=128, activation='relu'))  # third hidden layer
model.add(Dense(units=total_unique_words_2, activation='softmax'))  # output layer
# model.add(Dense(units=128))  # output layer

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               2048      
_________________________________________________________________
dense_1 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_2 (Dense)              (None, 24)                6168      
Total params: 139,544
Trainable params: 139,544
Non-trainable params: 0
_________________________________________________________________


In [32]:
model.fit(train_set, train_labels, epochs=64, batch_size=256)

Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64
Epoch 14/64
Epoch 15/64
Epoch 16/64
Epoch 17/64
Epoch 18/64
Epoch 19/64
Epoch 20/64
Epoch 21/64
Epoch 22/64
Epoch 23/64
Epoch 24/64
Epoch 25/64
Epoch 26/64
Epoch 27/64
Epoch 28/64
Epoch 29/64
Epoch 30/64
Epoch 31/64
Epoch 32/64
Epoch 33/64
Epoch 34/64
Epoch 35/64
Epoch 36/64
Epoch 37/64
Epoch 38/64
Epoch 39/64
Epoch 40/64
Epoch 41/64
Epoch 42/64
Epoch 43/64
Epoch 44/64
Epoch 45/64
Epoch 46/64
Epoch 47/64
Epoch 48/64
Epoch 49/64
Epoch 50/64
Epoch 51/64
Epoch 52/64
Epoch 53/64
Epoch 54/64
Epoch 55/64
Epoch 56/64
Epoch 57/64
Epoch 58/64
Epoch 59/64
Epoch 60/64
Epoch 61/64
Epoch 62/64
Epoch 63/64
Epoch 64/64


<tensorflow.python.keras.callbacks.History at 0x7f4392d7c3c8>

In [33]:
print("test loss, test acc:", model.evaluate(test_set, test_labels))

test loss, test acc: [2.5164101123809814, 0.2449605017900467]


#### Phonemes

In [34]:
train_set, train_labels = get_labels(two_sentence_train, label=["phonemes"])
test_set, test_labels = get_labels(two_sentence_test, label=["phonemes"])

In [35]:
total_labels_2 = train_labels.append(test_labels)
print(len(pd.unique(train_labels.phonemes)))
print(len(pd.unique(test_labels.phonemes)))
total_unique_phonemes_2 = len(pd.unique(total_labels_2.phonemes))
total_unique_phonemes_2

40
38


40

In [36]:
train_labels, test_labels, _ = prepare_labels(train_labels, test_labels, label=["phonemes"])

  return f(*args, **kwargs)


In [37]:
train_labels = to_categorical(train_labels, num_classes=total_unique_phonemes_2)
test_labels = to_categorical(test_labels, num_classes=total_unique_phonemes_2)
print(train_labels.shape)

(109472, 40)


In [38]:
model = Sequential()

model.add(Dense(units=512, activation='relu', input_dim=3*1))  # first hidden layer
model.add(Dense(units=256, activation='relu'))  # second hidden layer
# model.add(Dense(units=128, activation='relu'))  # third hidden layer
model.add(Dense(units=total_unique_phonemes_2, activation='softmax'))  # output layer
# model.add(Dense(units=128))  # output layer

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 512)               2048      
_________________________________________________________________
dense_4 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_5 (Dense)              (None, 40)                10280     
Total params: 143,656
Trainable params: 143,656
Non-trainable params: 0
_________________________________________________________________


In [39]:
model.fit(train_set, train_labels, epochs=64, batch_size=256)

Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64
Epoch 14/64
Epoch 15/64
Epoch 16/64
Epoch 17/64
Epoch 18/64
Epoch 19/64
Epoch 20/64
Epoch 21/64
Epoch 22/64
Epoch 23/64
Epoch 24/64
Epoch 25/64
Epoch 26/64
Epoch 27/64
Epoch 28/64
Epoch 29/64
Epoch 30/64
Epoch 31/64
Epoch 32/64
Epoch 33/64
Epoch 34/64
Epoch 35/64
Epoch 36/64
Epoch 37/64
Epoch 38/64
Epoch 39/64
Epoch 40/64
Epoch 41/64
Epoch 42/64
Epoch 43/64
Epoch 44/64
Epoch 45/64
Epoch 46/64
Epoch 47/64
Epoch 48/64
Epoch 49/64
Epoch 50/64
Epoch 51/64
Epoch 52/64
Epoch 53/64
Epoch 54/64
Epoch 55/64
Epoch 56/64
Epoch 57/64
Epoch 58/64
Epoch 59/64
Epoch 60/64
Epoch 61/64
Epoch 62/64
Epoch 63/64
Epoch 64/64


<tensorflow.python.keras.callbacks.History at 0x7f4392df2eb8>

In [40]:
print("train loss, train acc:", model.evaluate(train_set, train_labels))

train loss, train acc: [2.439312219619751, 0.2837803363800049]


In [41]:
print("test loss, test acc:", model.evaluate(test_set, test_labels))

test loss, test acc: [2.48190975189209, 0.27832457423210144]


### LSTM

In [42]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

In [43]:
# model_lstm = Sequential()
# model_lstm.add(LSTM(256, input_shape = (1, 3)))
# model_lstm.add(Dense(units=total_unique_words))
# model_lstm.compile(loss='categorical_crossentropy',
#               optimizer='adam',
#               metrics=['accuracy']
#              )

# model.summary()

In [44]:
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers import SpatialDropout1D

In [45]:
train_set, train_labels = get_labels(two_sentence_train, label=["words"])
test_set, test_labels = get_labels(two_sentence_test, label=["words"])

In [46]:
total_labels_2 = train_labels.append(test_labels)
print(len(pd.unique(train_labels.words)))
print(len(pd.unique(test_labels.words)))
total_unique_words_2 = len(pd.unique(total_labels_2.words))
total_unique_words_2

24
24


24

In [47]:
train_labels, test_labels, _ = prepare_labels(train_labels, test_labels)

  return f(*args, **kwargs)


In [48]:
train_labels = to_categorical(train_labels, num_classes=total_unique_words_2)
test_labels = to_categorical(test_labels, num_classes=total_unique_words_2)
print(train_labels.shape)

(109472, 24)


In [49]:
reshaped_values = train_set.values.reshape(-1, 1, 3)
reshaped_values[0][0]

array([ 0, 32, 32])

In [50]:
#more elaborate model
model_lstm = Sequential()

#model_lstm.add(Embedding(input_dim = 3, output_dim = 2, input_length = 86497))
#model_lstm.add(SpatialDropout1D(0.3))
model_lstm.add(LSTM(256, input_shape = (1, 3), dropout = 0.3, recurrent_dropout = 0.3))
model_lstm.add(Dense(256, activation = 'relu'))
model_lstm.add(Dropout(0.3))
model_lstm.add(Dense(total_unique_words_2, activation = 'softmax'))

model_lstm.compile(
    loss='categorical_crossentropy',
    optimizer='Adam',
    metrics=['accuracy']
)

model_lstm.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 256)               266240    
_________________________________________________________________
dense_6 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 24)                6168      
Total params: 338,200
Trainable params: 338,200
Non-trainable params: 0
_________________________________________________________________


In [51]:
model_lstm.fit(reshaped_values, train_labels, epochs=64, batch_size=256)

Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64
Epoch 14/64
Epoch 15/64
Epoch 16/64
Epoch 17/64
Epoch 18/64
Epoch 19/64
Epoch 20/64
Epoch 21/64
Epoch 22/64
Epoch 23/64
Epoch 24/64
Epoch 25/64
Epoch 26/64
Epoch 27/64
Epoch 28/64
Epoch 29/64
Epoch 30/64
Epoch 31/64
Epoch 32/64
Epoch 33/64
Epoch 34/64
Epoch 35/64
Epoch 36/64
Epoch 37/64
Epoch 38/64
Epoch 39/64
Epoch 40/64
Epoch 41/64
Epoch 42/64
Epoch 43/64
Epoch 44/64
Epoch 45/64
Epoch 46/64
Epoch 47/64
Epoch 48/64
Epoch 49/64
Epoch 50/64
Epoch 51/64
Epoch 52/64
Epoch 53/64
Epoch 54/64
Epoch 55/64
Epoch 56/64
Epoch 57/64
Epoch 58/64
Epoch 59/64
Epoch 60/64
Epoch 61/64
Epoch 62/64
Epoch 63/64
Epoch 64/64


<tensorflow.python.keras.callbacks.History at 0x7f4392ec3400>

In [52]:
print("train loss, train acc:", model_lstm.evaluate(reshaped_values, train_labels))

train loss, train acc: [3.512779712677002, 0.17978112399578094]


In [53]:
print("test loss, test acc:", model_lstm.evaluate(test_set.values.reshape(-1, 1, 3), test_labels))

test loss, test acc: [3.6062262058258057, 0.17287519574165344]


#### Phonemes:

In [54]:
train_set, train_labels = get_labels(two_sentence_train, label=["phonemes"])
test_set, test_labels = get_labels(two_sentence_test, label=["phonemes"])

In [55]:
total_labels_2 = train_labels.append(test_labels)
print(len(pd.unique(train_labels.phonemes)))
print(len(pd.unique(test_labels.phonemes)))
total_unique_phonemes_2 = len(pd.unique(total_labels_2.phonemes))
total_unique_phonemes_2

40
38


40

In [56]:
train_labels, test_labels, _ = prepare_labels(train_labels, test_labels, label=["phonemes"])

In [57]:
train_labels = to_categorical(train_labels, num_classes=total_unique_phonemes_2)
test_labels = to_categorical(test_labels, num_classes=total_unique_phonemes_2)
print(train_labels.shape)

(109472, 40)


In [58]:
reshaped_values = train_set.values.reshape(-1, 1, 3)
reshaped_values[0][0]

array([ 0, 32, 32])

In [59]:
#more elaborate model
model_lstm = Sequential()

#model_lstm.add(Embedding(input_dim = 3, output_dim = 2, input_length = 86497))
#model_lstm.add(SpatialDropout1D(0.3))
model_lstm.add(LSTM(256, input_shape = (1, 3), dropout = 0.3, recurrent_dropout = 0.3))
model_lstm.add(Dense(256, activation = 'relu'))
model_lstm.add(Dropout(0.3))
model_lstm.add(Dense(total_unique_phonemes_2, activation = 'softmax'))

model_lstm.compile(
    loss='categorical_crossentropy',
    optimizer='Adam',
    metrics=['accuracy']
)

model_lstm.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 256)               266240    
_________________________________________________________________
dense_8 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 40)                10280     
Total params: 342,312
Trainable params: 342,312
Non-trainable params: 0
_________________________________________________________________


In [60]:
model_lstm.fit(reshaped_values, train_labels, epochs=64, batch_size=256)

Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64
Epoch 14/64
Epoch 15/64
Epoch 16/64
Epoch 17/64
Epoch 18/64
Epoch 19/64
Epoch 20/64
Epoch 21/64
Epoch 22/64
Epoch 23/64
Epoch 24/64
Epoch 25/64
Epoch 26/64
Epoch 27/64
Epoch 28/64
Epoch 29/64
Epoch 30/64
Epoch 31/64
Epoch 32/64
Epoch 33/64
Epoch 34/64
Epoch 35/64
Epoch 36/64
Epoch 37/64
Epoch 38/64
Epoch 39/64
Epoch 40/64
Epoch 41/64
Epoch 42/64
Epoch 43/64
Epoch 44/64
Epoch 45/64
Epoch 46/64
Epoch 47/64
Epoch 48/64
Epoch 49/64
Epoch 50/64
Epoch 51/64
Epoch 52/64
Epoch 53/64
Epoch 54/64
Epoch 55/64
Epoch 56/64
Epoch 57/64
Epoch 58/64
Epoch 59/64
Epoch 60/64
Epoch 61/64
Epoch 62/64
Epoch 63/64
Epoch 64/64


<tensorflow.python.keras.callbacks.History at 0x7f43931166a0>

In [61]:
print("train loss, train acc:", model_lstm.evaluate(reshaped_values, train_labels))

train loss, train acc: [4.83518123626709, 0.18505188822746277]


In [62]:
print("test loss, test acc:", model_lstm.evaluate(test_set.values.reshape(-1, 1, 3), test_labels))

test loss, test acc: [4.983070373535156, 0.17728552222251892]


### Decisions Tree

In [63]:
from sklearn.tree import DecisionTreeClassifier

In [64]:
train_set, train_labels = get_labels(two_sentence_train, label=["words"])
test_set, test_labels = get_labels(two_sentence_test, label=["words"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels)

In [65]:
tree_clf_pipeline = Pipeline(
    [
        (
            "clf",
            DecisionTreeClassifier(criterion="entropy", max_depth=None, splitter="best",
                                   min_samples_split=2, random_state=42),
        ),
    ]
)

In [66]:
print("Starting!")
tree_clf_pipeline.fit(train_set, train_labels)
print("Finished!")

print(f"Train accuracy: {tree_clf_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {tree_clf_pipeline.score(test_set, test_labels):.4f}")

Starting!
Finished!
Train accuracy: 0.5684
Test accuracy : 0.1834


Phonemes

In [67]:
train_set, train_labels = get_labels(two_sentence_train, label=["phonemes"])
test_set, test_labels = get_labels(two_sentence_test, label=["phonemes"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels, label=["phonemes"])

  return f(*args, **kwargs)


In [68]:
tree_clf_pipeline = Pipeline(
    [
        (
            "clf",
            DecisionTreeClassifier(criterion="entropy", max_depth=None, splitter="best",
                                   min_samples_split=2, random_state=42),
        ),
    ]
)

In [69]:
print("Starting!")
tree_clf_pipeline.fit(train_set, train_labels)
print("Finished!")

print(f"Train accuracy: {tree_clf_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {tree_clf_pipeline.score(test_set, test_labels):.4f}")

Starting!
Finished!
Train accuracy: 0.5770
Test accuracy : 0.2019


### KNN

In [70]:
from sklearn.neighbors import KNeighborsClassifier

In [71]:
train_set, train_labels = get_labels(two_sentence_train, label=["words"])
test_set, test_labels = get_labels(two_sentence_test, label=["words"])

train_labels, test_labels, _ = prepare_labels(two_sentence_train, test_labels)

  return f(*args, **kwargs)


In [32]:
parameters = {'n_neighbors':[16,32], 'weights':['uniform', 'distance'], 'n_jobs':[-1]}

In [33]:
orig_clf = KNeighborsClassifier()
gscv_clf = GridSearchCV(orig_clf, parameters, n_jobs = -1, cv=cv_dialect_splitter())

In [34]:
print("Starting!")
gscv_clf.fit(train_set, train_labels)
print("Finished!")

print("Best: %f using %s" % (gscv_clf.best_score_, gscv_clf.best_params_))
print(f"Train accuracy: {gscv_clf.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {gscv_clf.score(test_set, test_labels):.4f}")

Starting!
Finished!
Best: 0.136635 using {'n_jobs': -1, 'n_neighbors': 32, 'weights': 'uniform'}
Train accuracy: 0.1700
Test accuracy : 0.1343


In [72]:
knn_clf_pipeline = Pipeline(
    [
        (
            "scaler",
            StandardScaler()
        ),
        (
            "clf",
            KNeighborsClassifier(32, weights='distance', n_jobs=-1)
        ),
    ]
)

# 20, distance => 0.2887, 0.1203
# 32, uniform => 0.1700, 0.1343
# 32, distance => 0.2912, 0.1216

In [73]:
print("Starting!")
knn_clf_pipeline.fit(train_set, train_labels)
print("Finished!")

print(f"Train accuracy: {knn_clf_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {knn_clf_pipeline.score(test_set, test_labels):.4f}")

Starting!
Finished!
Train accuracy: 0.5684
Test accuracy : 0.2000


For the search space of 32 nearest neighbours we get around 12% success rate on our test data (which is around 31436 words). I have listed other parameters and their resulting percentages in the comments in the code cell. Also worth noting is that "StandardScaler" only worsens (not tested on skype) our predictions.

Now let's try our luck with phonemes:

In [74]:
train_set, train_labels = get_labels(two_sentence_train, label=["phonemes"])
test_set, test_labels = get_labels(two_sentence_test, label=["phonemes"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels, label=["phonemes"])

  return f(*args, **kwargs)


In [32]:
parameters = {'n_neighbors':[16,32], 'weights':['uniform', 'distance'], 'n_jobs':[-1]}

In [33]:
orig_clf = KNeighborsClassifier()
gscv_clf = GridSearchCV(orig_clf, parameters, n_jobs = -1, cv=cv_dialect_splitter())

In [34]:
print("Starting!")
gscv_clf.fit(train_set, train_labels)
print("Finished!")

print("Best: %f using %s" % (gscv_clf.best_score_, gscv_clf.best_params_))
print(f"Train accuracy: {gscv_clf.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {gscv_clf.score(test_set, test_labels):.4f}")

Starting!
Finished!
Best: 0.136635 using {'n_jobs': -1, 'n_neighbors': 32, 'weights': 'uniform'}
Train accuracy: 0.1700
Test accuracy : 0.1343


In [75]:
knn_clf_pipeline = Pipeline(
    [
        (
            "clf",
            KNeighborsClassifier(32, weights='distance', n_jobs=-1)
        ),
    ]
)
# 20, distance => 0.3221, 0.1377
# 32, uniform => 0.2093, 0.1574
# 32, distance => 0.3265, 0.1410

In [76]:
print("Starting!")
knn_clf_pipeline.fit(train_set, train_labels)
print("Finished!")

print(f"Train accuracy: {knn_clf_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {knn_clf_pipeline.score(test_set, test_labels):.4f}")

Starting!
Finished!
Train accuracy: 0.5770
Test accuracy : 0.2241
