In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

sns.set()  # make plots nicer

np.random.seed(42)  # set seed for reproducibility

In [2]:
def file_parser_with_prev_next(path):
    file = open(path, 'r')
    lines = file.readlines()
    
    file_name = [path.split('/')[-1]]
    sentence = ""
    file_data = []
    
    has_value = False
    previous = 0
    
    for line in lines:
        line = line.strip()
        
        # if there are only 2 informations on line and second is h#, then ignore
        # strip line, split primarly on ; secondary on ,
        if (line.startswith('#')):
            if (not sentence):
                sentence = line[len('# Sentence: "'): len(line) - 1]
            continue
        
        line = line.split(';')
        
        if (len(line) == 1):
            #lines containing only their packet size and nothing else, they should be added
            #TODO
            line += [""]
            line += [""]
            #continue
        
        if (len(line) == 2):
            #this tries to remove most of the silence at the start of the recording
            #potentionally harmfull as we shouldn't clean test data this way (we will be reading labels)
            #if (line[1] == 'h#'):
            #    continue
            line += [""]
        
        line[1] = tuple(line[1].split(','))
        line[2] = tuple(list(map(lambda a: a.strip('"'), line[2].split(','))))
        
        if (has_value):
            file_data[-1][-4] = line[0]
           
        # file_type and sentence contain duplicate informations, but are kept for readability
        split_filename = file_name[0].split('-')
        
        line = file_name + [split_filename[0]] + [split_filename[1]] + [split_filename[2][0:-4]] + [sentence] + [previous] + [0] + line
        #adding previous as feature
        previous = line[-3]
        file_data += [line]
        
        #adding next frame as feature
        has_value = True
        
    return pd.DataFrame(file_data, columns=['file', 'dialect', 'speaker', 'sentence_id', 'sentence', 'previous_packet', 'next_packet','packet_size', 'phonemes', 'words'])

def load_files_with_prev_next(directory):
    filelist = os.listdir(directory)
    #read them into pandas
    df_list = [file_parser_with_prev_next(directory+file) for file in filelist]
    #concatenate them together
    return pd.concat(df_list, ignore_index=True)

def convert_types(data_frame):
    data_frame['packet_size'] = pd.to_numeric(data_frame['packet_size'])
    data_frame['previous_packet'] = pd.to_numeric(data_frame['previous_packet'])
    data_frame['next_packet'] = pd.to_numeric(data_frame['next_packet'])

    data_frame['file'] = data_frame['file'].astype('category')
    data_frame['sentence'] = data_frame['sentence'].astype('category')
    
    data_frame['dialect'] = data_frame['dialect'].astype('category')
    data_frame['speaker'] = data_frame['speaker'].astype('category')
    data_frame['sentence_id'] = data_frame['sentence_id'].astype('category')

In [3]:
skype_data_train = load_files_with_prev_next("./../data/skype_train_data/")
skype_data_test = load_files_with_prev_next("./../data/skype_test_data/")
convert_types(skype_data_train)
convert_types(skype_data_test)
skype_data_test

Unnamed: 0,file,dialect,speaker,sentence_id,sentence,previous_packet,next_packet,packet_size,phonemes,words
0,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,0,35,30,"(h#,)","(,)"
1,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,30,43,35,"(h#,)","(,)"
2,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,35,26,43,"(h#,)","(,)"
3,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,43,30,26,"(h#,)","(,)"
4,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,26,31,30,"(h#,)","(,)"
...,...,...,...,...,...,...,...,...,...,...
258516,DR8-MSLB0-SX383.CSV,DR8,MSLB0,SX383,The carpet cleaners shampooed our oriental rug.,40,43,46,"(h#,)","(,)"
258517,DR8-MSLB0-SX383.CSV,DR8,MSLB0,SX383,The carpet cleaners shampooed our oriental rug.,46,41,43,"(h#,)","(,)"
258518,DR8-MSLB0-SX383.CSV,DR8,MSLB0,SX383,The carpet cleaners shampooed our oriental rug.,43,34,41,"(h#,)","(,)"
258519,DR8-MSLB0-SX383.CSV,DR8,MSLB0,SX383,The carpet cleaners shampooed our oriental rug.,41,33,34,"(h#,)","(,)"


In [4]:
def add_surrounding(data_frame):
    data_frame['prev_curr'] = list(zip(data_frame.previous_packet, data_frame.packet_size))
    data_frame['next_curr'] = list(zip(data_frame.next_packet, data_frame.packet_size))
    data_frame['packet_surrounding'] = list(zip(data_frame.previous_packet, data_frame.packet_size, data_frame.next_packet))
    
    #data_frame['prev_curr'] = data_frame['prev_curr'].astype('category')
    #data_frame['next_curr'] = data_frame['next_curr'].astype('category')
    #data_frame['packet_surrounding'] = data_frame['packet_surrounding'].astype('category')

add_surrounding(skype_data_train)
add_surrounding(skype_data_test)

skype_data_train = skype_data_train[['file', 'dialect', 'speaker', 'sentence_id', 'sentence', 'previous_packet', 'next_packet','packet_size', 'prev_curr', 'next_curr', 'packet_surrounding', 'phonemes', 'words']]
skype_data_test = skype_data_test[['file', 'dialect', 'speaker', 'sentence_id', 'sentence', 'previous_packet', 'next_packet','packet_size', 'prev_curr', 'next_curr', 'packet_surrounding', 'phonemes', 'words']]
skype_data_train

Unnamed: 0,file,dialect,speaker,sentence_id,sentence,previous_packet,next_packet,packet_size,prev_curr,next_curr,packet_surrounding,phonemes,words
0,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,0,32,32,"(0, 32)","(32, 32)","(0, 32, 32)","(h#,)","(,)"
1,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,32,31,32,"(32, 32)","(31, 32)","(32, 32, 31)","(h#,)","(,)"
2,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,32,28,31,"(32, 31)","(28, 31)","(32, 31, 28)","(h#,)","(,)"
3,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,31,28,28,"(31, 28)","(28, 28)","(31, 28, 28)","(h#,)","(,)"
4,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,28,36,28,"(28, 28)","(36, 28)","(28, 28, 36)","(h#,)","(,)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
707433,DR8-MTCS0-SX82.CSV,DR8,MTCS0,SX82,Good service should be rewarded by big tips.,47,34,32,"(47, 32)","(34, 32)","(47, 32, 34)","(h#,)","(,)"
707434,DR8-MTCS0-SX82.CSV,DR8,MTCS0,SX82,Good service should be rewarded by big tips.,32,39,34,"(32, 34)","(39, 34)","(32, 34, 39)","(h#,)","(,)"
707435,DR8-MTCS0-SX82.CSV,DR8,MTCS0,SX82,Good service should be rewarded by big tips.,34,33,39,"(34, 39)","(33, 39)","(34, 39, 33)","(h#,)","(,)"
707436,DR8-MTCS0-SX82.CSV,DR8,MTCS0,SX82,Good service should be rewarded by big tips.,39,36,33,"(39, 33)","(36, 33)","(39, 33, 36)","(h#,)","(,)"


### Something about preparing

In [5]:
skype_data_train.loc[:, ["previous_packet", "packet_size", "next_packet"]]

Unnamed: 0,previous_packet,packet_size,next_packet
0,0,32,32
1,32,32,31
2,32,31,28
3,31,28,28
4,28,28,36
...,...,...,...
707433,47,32,34
707434,32,34,39
707435,34,39,33
707436,39,33,36


In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

In [7]:
# add removal of labels for the test_dataset
def get_labels(df, label=["words"], feature=["previous_packet", "packet_size", "next_packet"]):
    labels = df.loc[:, label]
    features = df.loc[:, feature]
    return features, labels

In [8]:
def prepare_labels(train_labels, test_labels, label=["words"]):
    train_labels = train_labels.astype('category')
    test_labels = test_labels.astype('category')
    
    total_labels = train_labels.append(test_labels)
    
    lab_enc = LabelEncoder()
    lab_enc.fit(total_labels[label])

    train_labels = lab_enc.transform(train_labels[label])
    test_labels = lab_enc.transform(test_labels[label])
    
    return train_labels, test_labels, lab_enc

### Additional preprocessing

In [9]:
# there are no "empty" phonemes
single_phonemes_train = skype_data_train.loc[skype_data_train.phonemes.apply(len) == 1].reset_index(drop=True)
single_phonemes_train

Unnamed: 0,file,dialect,speaker,sentence_id,sentence,previous_packet,next_packet,packet_size,prev_curr,next_curr,packet_surrounding,phonemes,words
0,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,0,32,32,"(0, 32)","(32, 32)","(0, 32, 32)","(h#,)","(,)"
1,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,32,31,32,"(32, 32)","(31, 32)","(32, 32, 31)","(h#,)","(,)"
2,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,32,28,31,"(32, 31)","(28, 31)","(32, 31, 28)","(h#,)","(,)"
3,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,31,28,28,"(31, 28)","(28, 28)","(31, 28, 28)","(h#,)","(,)"
4,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,28,36,28,"(28, 28)","(36, 28)","(28, 28, 36)","(h#,)","(,)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
533700,DR8-MTCS0-SX82.CSV,DR8,MTCS0,SX82,Good service should be rewarded by big tips.,47,34,32,"(47, 32)","(34, 32)","(47, 32, 34)","(h#,)","(,)"
533701,DR8-MTCS0-SX82.CSV,DR8,MTCS0,SX82,Good service should be rewarded by big tips.,32,39,34,"(32, 34)","(39, 34)","(32, 34, 39)","(h#,)","(,)"
533702,DR8-MTCS0-SX82.CSV,DR8,MTCS0,SX82,Good service should be rewarded by big tips.,34,33,39,"(34, 39)","(33, 39)","(34, 39, 33)","(h#,)","(,)"
533703,DR8-MTCS0-SX82.CSV,DR8,MTCS0,SX82,Good service should be rewarded by big tips.,39,36,33,"(39, 33)","(36, 33)","(39, 33, 36)","(h#,)","(,)"


In [10]:
len(pd.unique(single_phonemes_train.phonemes))

62

In [11]:
pd.unique(single_phonemes_train.phonemes)

array([('h#',), ('sh',), ('ix',), ('hv',), ('eh',), ('jh',), ('ih',),
       ('dcl',), ('ah',), ('kcl',), ('k',), ('s',), ('ux',), ('q',),
       ('en',), ('r',), ('w',), ('ao',), ('axr',), ('l',), ('y',),
       ('uh',), ('n',), ('ae',), ('dx',), ('oy',), ('ax',), ('gcl',),
       ('dh',), ('tcl',), ('iy',), ('v',), ('t',), ('f',), ('ow',),
       ('d',), ('hh',), ('ch',), ('bcl',), ('aa',), ('em',), ('ng',),
       ('m',), ('ay',), ('th',), ('ax-h',), ('ey',), ('p',), ('pcl',),
       ('aw',), ('er',), ('z',), ('epi',), ('el',), ('uw',), ('g',),
       ('',), ('b',), ('pau',), ('zh',), ('nx',), ('eng',)], dtype=object)

Taken from: https://github.com/jhasegaw/phonecodes/blob/master/src/phonecode_tables.py

In [12]:
arpa_to_ipa = {
    'aa':'ɑ',
    'ae':'æ',
    'ah':'ʌ',
    'ah0':'ə',
    'ao':'ɔ',
    'aw':'aʊ',
    'ay':'aɪ',
    'eh':'ɛ',
    'er':'ɝ',
    'er0':'ɚ',
    'ey':'eɪ',
    'ih':'ɪ',
    'ih0':'ɨ',
    'iy':'i',
    'ow':'oʊ',
    'oy':'ɔɪ',
    'uh':'ʊ',
    'uw':'u',
    'b':'b',
    'ch':'tʃ',
    'd':'d',
    'dh':'ð',
    'el':'l̩',
    'em':'m̩',
    'en':'n̩',
    'f':'f',
    'g':'ɡ',
    'hh':'h',
    'jh':'dʒ',
    'k':'k',
    'l':'l',
    'm':'m',
    'n':'n',
    'ng':'ŋ',
    'p':'p',
    'q':'ʔ',
    'r':'ɹ',
    's':'s',
    'sh':'ʃ',
    't':'t',
    'th':'θ',
    'v':'v',
    'w':'w',
    'wh':'ʍ',
    'y':'j',
    'z':'z',
    'zh':'ʒ',

    'ax':'ə',
    'ax-h':'ə̥',
    'axr':'ɚ',
    'bcl':'b',
    'dcl':'d',
    'dx':'ɾ',
    'eng':'ŋ̍',
    'epi':'',
    'gcl':'g',
    'hv':'ɦ',
    'h#':'',
    'ix':'ɨ',
    'kcl':'k',
    'nx':'ɾ̃',
    'pau':'',
    'pcl':'p',
    'tcl':'t',
    'ux':'ʉ',
    '':'',
}

In [13]:
len(arpa_to_ipa)

66

This modification is based on this: https://en.wikipedia.org/wiki/ARPABET (+ minor guessing)

In [14]:
ipa_allophone = {
    'ŋ̍':'n', #should be ŋ
    'ə̥':'ɛ',
    'ɨ':'ɪ',
    'n̩':'n',
    'm̩':'m',
    'ŋ':'n',
    'ɾ̃':'n',
    'ð':'θ',
    'ʉ':'u',
    'ɾ':'d',
    'l̩':'l',
}

In [15]:
uniq_phon = pd.unique(single_phonemes_train.phonemes)
for i in range(len(uniq_phon)):
    uniq_phon[i] = arpa_to_ipa.get(uniq_phon[i][0], uniq_phon[i][0])
    uniq_phon[i] = ipa_allophone.get(uniq_phon[i], uniq_phon[i])
    
uniq_phon

array(['', 'ʃ', 'ɪ', 'ɦ', 'ɛ', 'dʒ', 'ɪ', 'd', 'ʌ', 'k', 'k', 's', 'u',
       'ʔ', 'n', 'ɹ', 'w', 'ɔ', 'ɚ', 'l', 'j', 'ʊ', 'n', 'æ', 'd', 'ɔɪ',
       'ə', 'g', 'θ', 't', 'i', 'v', 't', 'f', 'oʊ', 'd', 'h', 'tʃ', 'b',
       'ɑ', 'm', 'n', 'm', 'aɪ', 'θ', 'ɛ', 'eɪ', 'p', 'p', 'aʊ', 'ɝ', 'z',
       '', 'l', 'u', 'ɡ', '', 'b', '', 'ʒ', 'n', 'n'], dtype=object)

In [16]:
len(np.unique(uniq_phon))

43

Now modifying our input dataset:

In [17]:
#input is expected to be a tuple
def convert_phoneme(phoneme):
    tmp_1 = arpa_to_ipa.get(phoneme[0], phoneme[0])
    tmp_2 = ipa_allophone.get(tmp_1, tmp_1)
    return tmp_2

In [18]:
single_phonemes_train = skype_data_train.loc[skype_data_train.phonemes.apply(len) == 1].reset_index(drop=True)
single_phonemes_test = skype_data_test.loc[skype_data_test.phonemes.apply(len) == 1].reset_index(drop=True)
single_phonemes_test

Unnamed: 0,file,dialect,speaker,sentence_id,sentence,previous_packet,next_packet,packet_size,prev_curr,next_curr,packet_surrounding,phonemes,words
0,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,0,35,30,"(0, 30)","(35, 30)","(0, 30, 35)","(h#,)","(,)"
1,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,30,43,35,"(30, 35)","(43, 35)","(30, 35, 43)","(h#,)","(,)"
2,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,35,26,43,"(35, 43)","(26, 43)","(35, 43, 26)","(h#,)","(,)"
3,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,43,30,26,"(43, 26)","(30, 26)","(43, 26, 30)","(h#,)","(,)"
4,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,26,31,30,"(26, 30)","(31, 30)","(26, 30, 31)","(h#,)","(,)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
195608,DR8-MSLB0-SX383.CSV,DR8,MSLB0,SX383,The carpet cleaners shampooed our oriental rug.,40,43,46,"(40, 46)","(43, 46)","(40, 46, 43)","(h#,)","(,)"
195609,DR8-MSLB0-SX383.CSV,DR8,MSLB0,SX383,The carpet cleaners shampooed our oriental rug.,46,41,43,"(46, 43)","(41, 43)","(46, 43, 41)","(h#,)","(,)"
195610,DR8-MSLB0-SX383.CSV,DR8,MSLB0,SX383,The carpet cleaners shampooed our oriental rug.,43,34,41,"(43, 41)","(34, 41)","(43, 41, 34)","(h#,)","(,)"
195611,DR8-MSLB0-SX383.CSV,DR8,MSLB0,SX383,The carpet cleaners shampooed our oriental rug.,41,33,34,"(41, 34)","(33, 34)","(41, 34, 33)","(h#,)","(,)"


In [19]:
single_phonemes_test['phonemes'] = single_phonemes_test["phonemes"].apply(convert_phoneme)
single_phonemes_train['phonemes'] = single_phonemes_train["phonemes"].apply(convert_phoneme)
single_phonemes_test

Unnamed: 0,file,dialect,speaker,sentence_id,sentence,previous_packet,next_packet,packet_size,prev_curr,next_curr,packet_surrounding,phonemes,words
0,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,0,35,30,"(0, 30)","(35, 30)","(0, 30, 35)",,"(,)"
1,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,30,43,35,"(30, 35)","(43, 35)","(30, 35, 43)",,"(,)"
2,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,35,26,43,"(35, 43)","(26, 43)","(35, 43, 26)",,"(,)"
3,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,43,30,26,"(43, 26)","(30, 26)","(43, 26, 30)",,"(,)"
4,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,26,31,30,"(26, 30)","(31, 30)","(26, 30, 31)",,"(,)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
195608,DR8-MSLB0-SX383.CSV,DR8,MSLB0,SX383,The carpet cleaners shampooed our oriental rug.,40,43,46,"(40, 46)","(43, 46)","(40, 46, 43)",,"(,)"
195609,DR8-MSLB0-SX383.CSV,DR8,MSLB0,SX383,The carpet cleaners shampooed our oriental rug.,46,41,43,"(46, 43)","(41, 43)","(46, 43, 41)",,"(,)"
195610,DR8-MSLB0-SX383.CSV,DR8,MSLB0,SX383,The carpet cleaners shampooed our oriental rug.,43,34,41,"(43, 41)","(34, 41)","(43, 41, 34)",,"(,)"
195611,DR8-MSLB0-SX383.CSV,DR8,MSLB0,SX383,The carpet cleaners shampooed our oriental rug.,41,33,34,"(41, 34)","(33, 34)","(41, 34, 33)",,"(,)"


In [20]:
tmp = pd.unique(single_phonemes_test.phonemes)
print(len(tmp))
tmp

43


array(['', 'ʃ', 'i', 'ɦ', 'æ', 'd', 'ɝ', 'ɑ', 'ɹ', 'k', 's', 'u', 'ɪ',
       'n', 'g', 'ɡ', 'w', 'ʔ', 'ɔ', 'l', 'j', 'ɚ', 'oʊ', 't', 'ɛ', 'ɔɪ',
       'aɪ', 'θ', 'h', 'z', 'p', 'ə', 'b', 'f', 'v', 'm', 'aʊ', 'ʌ', 'eɪ',
       'tʃ', 'ʊ', 'dʒ', 'ʒ'], dtype=object)

In [21]:
train_set, train_labels = get_labels(single_phonemes_train, label=['phonemes'])
test_set, test_labels = get_labels(single_phonemes_test, label=['phonemes'])

train_labels = train_labels.astype('category')
test_labels = test_labels.astype('category')

total_labels = train_labels.append(test_labels)
print(len(pd.unique(train_labels.phonemes)))
print(len(pd.unique(test_labels.phonemes)))
total_unique_phonemes = len(pd.unique(total_labels.phonemes))
total_unique_phonemes

43
43


43

In [22]:
train_set, train_labels = get_labels(single_phonemes_train)
test_set, test_labels = get_labels(single_phonemes_test)

train_labels = train_labels.astype('category')
test_labels = test_labels.astype('category')

total_labels = train_labels.append(test_labels)
print(len(pd.unique(train_labels.words)))
print(len(pd.unique(test_labels.words)))
total_unique_words = len(pd.unique(total_labels.words))
total_unique_words

5104
2464


6387

## Models:

In [23]:
import keras
import tensorflow as tf

from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import classification_report

In [24]:
single_phonemes_train = skype_data_train.loc[skype_data_train.phonemes.apply(len) == 1].reset_index(drop=True)
single_phonemes_test = skype_data_test.loc[skype_data_test.phonemes.apply(len) == 1].reset_index(drop=True)

single_phonemes_test['phonemes'] = single_phonemes_test["phonemes"].apply(convert_phoneme)
single_phonemes_train['phonemes'] = single_phonemes_train["phonemes"].apply(convert_phoneme)

In [25]:
train_set, train_labels = get_labels(single_phonemes_train, label=['phonemes'])
test_set, test_labels = get_labels(single_phonemes_test, label=['phonemes'])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels, label=['phonemes'])

  return f(*args, **kwargs)


In [26]:
print(train_set.shape)
print(train_labels.shape)

(533705, 3)
(533705,)


In [27]:
train_labels = to_categorical(train_labels, num_classes=total_unique_phonemes)
test_labels = to_categorical(test_labels, num_classes=total_unique_phonemes)
print(train_labels.shape)

(533705, 43)


In [28]:
model = Sequential()

model.add(Dense(units=512, activation='relu', input_dim=3*1))  # first hidden layer
model.add(Dense(units=256, activation='relu'))  # second hidden layer
# model.add(Dense(units=128, activation='relu'))  # third hidden layer
model.add(Dense(units=total_unique_phonemes, activation='softmax'))  # output layer
# model.add(Dense(units=128))  # output layer

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               2048      
_________________________________________________________________
dense_1 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_2 (Dense)              (None, 43)                11051     
Total params: 144,427
Trainable params: 144,427
Non-trainable params: 0
_________________________________________________________________


In [29]:
model.fit(train_set, train_labels, epochs=16, batch_size=256)

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


<tensorflow.python.keras.callbacks.History at 0x7f2619944048>

In [30]:
print("test loss, test acc:", model.evaluate(test_set, test_labels))

test loss, test acc: [2.7815287113189697, 0.2520640194416046]


In [31]:
from sklearn import preprocessing

scaler = preprocessing.MinMaxScaler()

train_set[["previous_packet", "next_packet", "packet_size"]] = scaler.fit_transform(train_set[["previous_packet", "next_packet", "packet_size"]])
test_set[["previous_packet", "next_packet", "packet_size"]] = scaler.transform(test_set[["previous_packet", "next_packet", "packet_size"]])
test_set

Unnamed: 0,previous_packet,packet_size,next_packet
0,0.000000,0.217391,0.343137
1,0.294118,0.271739,0.421569
2,0.343137,0.358696,0.254902
3,0.421569,0.173913,0.294118
4,0.254902,0.217391,0.303922
...,...,...,...
195608,0.392157,0.391304,0.421569
195609,0.450980,0.358696,0.401961
195610,0.421569,0.336957,0.333333
195611,0.401961,0.260870,0.323529


In [32]:
model = Sequential()

model.add(Dense(units=512, activation='relu', input_dim=3*1))  # first hidden layer
model.add(Dense(units=256, activation='relu'))  # second hidden layer
# model.add(Dense(units=128, activation='relu'))  # third hidden layer
model.add(Dense(units=total_unique_phonemes, activation='softmax'))  # output layer
# model.add(Dense(units=128))  # output layer

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 512)               2048      
_________________________________________________________________
dense_4 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_5 (Dense)              (None, 43)                11051     
Total params: 144,427
Trainable params: 144,427
Non-trainable params: 0
_________________________________________________________________


In [33]:
model.fit(train_set, train_labels, epochs=16, batch_size=256)

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


<tensorflow.python.keras.callbacks.History at 0x7f26199bbc50>

In [34]:
print("test loss, test acc:", model.evaluate(test_set, test_labels))

test loss, test acc: [2.7804369926452637, 0.25184932351112366]


### LSTM

In [24]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

In [26]:
# model_lstm = Sequential()
# model_lstm.add(LSTM(256, input_shape = (1, 3)))
# model_lstm.add(Dense(units=total_unique_words))
# model_lstm.compile(loss='categorical_crossentropy',
#               optimizer='adam',
#               metrics=['accuracy']
#              )

# model_lstm.summary()

In [27]:
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers import SpatialDropout1D

In [28]:
#more elaborate model
model_lstm = Sequential()

#model_lstm.add(Embedding(input_dim = 3, output_dim = 2, input_length = 86497))
#model_lstm.add(SpatialDropout1D(0.3))
model_lstm.add(LSTM(256, input_shape = (1, 3), dropout = 0.3, recurrent_dropout = 0.3))
model_lstm.add(Dense(256, activation = 'relu'))
model_lstm.add(Dropout(0.3))
model_lstm.add(Dense(total_unique_phonemes, activation = 'softmax'))

model_lstm.compile(
    loss='categorical_crossentropy',
    optimizer='Adam',
    metrics=['accuracy']
)

model_lstm.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 256)               266240    
_________________________________________________________________
dense_1 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 43)                11051     
Total params: 343,083
Trainable params: 343,083
Non-trainable params: 0
_________________________________________________________________


In [29]:
single_phonemes_train = skype_data_train.loc[skype_data_train.phonemes.apply(len) == 1].reset_index(drop=True)
single_phonemes_test = skype_data_test.loc[skype_data_test.phonemes.apply(len) == 1].reset_index(drop=True)

single_phonemes_test['phonemes'] = single_phonemes_test["phonemes"].apply(convert_phoneme)
single_phonemes_train['phonemes'] = single_phonemes_train["phonemes"].apply(convert_phoneme)

In [30]:
train_set, train_labels = get_labels(single_phonemes_train, label=['phonemes'])
test_set, test_labels = get_labels(single_phonemes_test, label=['phonemes'])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels, label=['phonemes'])

  return f(*args, **kwargs)


In [31]:
from keras.utils import to_categorical

train_labels = to_categorical(train_labels, num_classes=total_unique_phonemes)
test_labels = to_categorical(test_labels, num_classes=total_unique_phonemes)
print(train_labels.shape)

(533705, 43)


In [32]:
reshaped_values = train_set.values.reshape(-1, 1, 3)
reshaped_values[0][0]

array([ 0, 32, 32])

In [33]:
model_lstm.fit(reshaped_values, train_labels, epochs=16, batch_size=128)

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


<tensorflow.python.keras.callbacks.History at 0x7f8ca99835c0>

In [34]:
print("test loss, test acc:", model_lstm.evaluate(test_set.values.reshape(-1, 1, 3), test_labels))

test loss, test acc: [4.356447696685791, 0.17691053450107574]


In [35]:
from sklearn import preprocessing

scaler = preprocessing.MinMaxScaler()

train_set[["previous_packet", "next_packet", "packet_size"]] = scaler.fit_transform(train_set[["previous_packet", "next_packet", "packet_size"]])
test_set[["previous_packet", "next_packet", "packet_size"]] = scaler.transform(test_set[["previous_packet", "next_packet", "packet_size"]])
test_set

Unnamed: 0,previous_packet,packet_size,next_packet
0,0.000000,0.217391,0.343137
1,0.294118,0.271739,0.421569
2,0.343137,0.358696,0.254902
3,0.421569,0.173913,0.294118
4,0.254902,0.217391,0.303922
...,...,...,...
195608,0.392157,0.391304,0.421569
195609,0.450980,0.358696,0.401961
195610,0.421569,0.336957,0.333333
195611,0.401961,0.260870,0.323529


In [36]:
#more elaborate model
model_lstm = Sequential()

#model_lstm.add(Embedding(input_dim = 3, output_dim = 2, input_length = 86497))
#model_lstm.add(SpatialDropout1D(0.3))
model_lstm.add(LSTM(256, input_shape = (1, 3), dropout = 0.3, recurrent_dropout = 0.3))
model_lstm.add(Dense(256, activation = 'relu'))
model_lstm.add(Dropout(0.3))
model_lstm.add(Dense(total_unique_phonemes, activation = 'softmax'))

model_lstm.compile(
    loss='categorical_crossentropy',
    optimizer='Adam',
    metrics=['accuracy']
)

model_lstm.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 256)               266240    
_________________________________________________________________
dense_3 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 43)                11051     
Total params: 343,083
Trainable params: 343,083
Non-trainable params: 0
_________________________________________________________________


In [37]:
reshaped_values = train_set.values.reshape(-1, 1, 3)
reshaped_values[0][0]

array([0.        , 0.23913043, 0.31372549])

In [38]:
model_lstm.fit(reshaped_values, train_labels, epochs=16, batch_size=128)

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


<tensorflow.python.keras.callbacks.History at 0x7f8ca9bc4f28>

In [39]:
print("test loss, test acc:", model_lstm.evaluate(test_set.values.reshape(-1, 1, 3), test_labels))

test loss, test acc: [4.750559329986572, 0.17720703780651093]


### KFold crossvalidation:

In [46]:
def create_model(output_size):
    model = Sequential()

    model.add(Dense(units=512, activation='relu', input_dim=3*1))  # first hidden layer
    model.add(Dense(units=256, activation='relu'))  # second hidden layer
    model.add(Dense(units=output_size, activation='softmax'))  # output layer

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    model.summary()
    
    return model

In [54]:
single_phonemes_train = skype_data_train.loc[skype_data_train.phonemes.apply(len) == 1].reset_index(drop=True)
single_phonemes_test = skype_data_test.loc[skype_data_test.phonemes.apply(len) == 1].reset_index(drop=True)

single_phonemes_test['phonemes'] = single_phonemes_test["phonemes"].apply(convert_phoneme)
single_phonemes_train['phonemes'] = single_phonemes_train["phonemes"].apply(convert_phoneme)

In [55]:
train_set, train_labels = get_labels(single_phonemes_train, label=['phonemes'])
test_set, test_labels = get_labels(single_phonemes_test, label=['phonemes'])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels, label=['phonemes'])

  return f(*args, **kwargs)


In [56]:
from sklearn import preprocessing

scaler = preprocessing.MinMaxScaler()

train_set[["previous_packet", "next_packet", "packet_size"]] = scaler.fit_transform(train_set[["previous_packet", "next_packet", "packet_size"]])
test_set[["previous_packet", "next_packet", "packet_size"]] = scaler.transform(test_set[["previous_packet", "next_packet", "packet_size"]])

In [57]:
from keras.utils import to_categorical

train_labels = to_categorical(train_labels, num_classes=total_unique_phonemes)
test_labels = to_categorical(test_labels, num_classes=total_unique_phonemes)
print(train_labels.shape)

(533705, 43)


In [58]:
dialects = pd.unique(skype_data_train.dialect)
for dialect in dialects:
    print("Now validating on dialect:", dialect)
    
    set_train = train_set.loc[single_phonemes_train["dialect"] != dialect]
    label_train = train_labels[single_phonemes_train["dialect"] != dialect]
    
    validation_set = train_set.loc[single_phonemes_train["dialect"] == dialect]
    validation_labels = train_labels[single_phonemes_train["dialect"] == dialect]
    
    model = create_model(total_unique_phonemes)
    
    display(model.fit(set_train, label_train, epochs=1, batch_size=256))
    
    print("test loss, test acc:", model.evaluate(validation_set, validation_labels))
print("\nDone!")

Now validating on dialect: DR1
Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_24 (Dense)             (None, 512)               2048      
_________________________________________________________________
dense_25 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_26 (Dense)             (None, 43)                11051     
Total params: 144,427
Trainable params: 144,427
Non-trainable params: 0
_________________________________________________________________


<tensorflow.python.keras.callbacks.History at 0x7f8b45ae0048>

test loss, test acc: [2.78438138961792, 0.2584441006183624]
Now validating on dialect: DR2
Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_27 (Dense)             (None, 512)               2048      
_________________________________________________________________
dense_28 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_29 (Dense)             (None, 43)                11051     
Total params: 144,427
Trainable params: 144,427
Non-trainable params: 0
_________________________________________________________________


<tensorflow.python.keras.callbacks.History at 0x7f8b45a59eb8>

test loss, test acc: [2.7862300872802734, 0.25312885642051697]
Now validating on dialect: DR3
Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_30 (Dense)             (None, 512)               2048      
_________________________________________________________________
dense_31 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_32 (Dense)             (None, 43)                11051     
Total params: 144,427
Trainable params: 144,427
Non-trainable params: 0
_________________________________________________________________


<tensorflow.python.keras.callbacks.History at 0x7f8b459d75f8>

test loss, test acc: [2.7923102378845215, 0.24886325001716614]
Now validating on dialect: DR4
Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_33 (Dense)             (None, 512)               2048      
_________________________________________________________________
dense_34 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_35 (Dense)             (None, 43)                11051     
Total params: 144,427
Trainable params: 144,427
Non-trainable params: 0
_________________________________________________________________


<tensorflow.python.keras.callbacks.History at 0x7f8b459485c0>

test loss, test acc: [2.833430767059326, 0.23450443148612976]
Now validating on dialect: DR5
Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_36 (Dense)             (None, 512)               2048      
_________________________________________________________________
dense_37 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_38 (Dense)             (None, 43)                11051     
Total params: 144,427
Trainable params: 144,427
Non-trainable params: 0
_________________________________________________________________


<tensorflow.python.keras.callbacks.History at 0x7f8b45b16b38>

test loss, test acc: [2.814870595932007, 0.24815744161605835]
Now validating on dialect: DR6
Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_39 (Dense)             (None, 512)               2048      
_________________________________________________________________
dense_40 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_41 (Dense)             (None, 43)                11051     
Total params: 144,427
Trainable params: 144,427
Non-trainable params: 0
_________________________________________________________________


<tensorflow.python.keras.callbacks.History at 0x7f8b45bb3a90>

test loss, test acc: [2.795966863632202, 0.25247427821159363]
Now validating on dialect: DR7
Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_42 (Dense)             (None, 512)               2048      
_________________________________________________________________
dense_43 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_44 (Dense)             (None, 43)                11051     
Total params: 144,427
Trainable params: 144,427
Non-trainable params: 0
_________________________________________________________________


<tensorflow.python.keras.callbacks.History at 0x7f8b45bf2eb8>

test loss, test acc: [2.7658674716949463, 0.2556726932525635]
Now validating on dialect: DR8
Model: "sequential_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_45 (Dense)             (None, 512)               2048      
_________________________________________________________________
dense_46 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_47 (Dense)             (None, 43)                11051     
Total params: 144,427
Trainable params: 144,427
Non-trainable params: 0
_________________________________________________________________


<tensorflow.python.keras.callbacks.History at 0x7f8b45ce61d0>

test loss, test acc: [2.7726612091064453, 0.2581544816493988]
Done
