# Replication of attack on VoIP end-to-end encrypted messengers

## Models

### Loading and preprocessing

We will now try to explore various models on `Whatsapp` dataset. Bellow we will find loading and preprocessing that we have come up with in the analysis section.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

sns.set()  # make plots nicer

np.random.seed(42)  # set seed for reproducibility

In [2]:
def file_parser_with_prev_next(path):
    file = open(path, 'r')
    lines = file.readlines()
    
    file_name = [path.split('/')[-1]]
    sentence = ""
    file_data = []
    
    has_value = False
    previous = 0
    
    for line in lines:
        line = line.strip()
        
        # if there are only 2 informations on line and second is h#, then ignore
        # strip line, split primarly on ; secondary on ,
        if (line.startswith('#')):
            if (not sentence):
                sentence = line[len('# Sentence: "'): len(line) - 1]
            continue
        
        line = line.split(';')
        
        if (len(line) == 1):
            #lines containing only their packet size and nothing else, they should be added
            #TODO
            line += [""]
            line += [""]
            #continue
        
        if (len(line) == 2):
            #this tries to remove most of the silence at the start of the recording
            #potentionally harmfull as we shouldn't clean test data this way (we will be reading labels)
            #if (line[1] == 'h#'):
            #    continue
            line += [""]
        
        line[1] = tuple(line[1].split(','))
        line[2] = tuple(list(map(lambda a: a.strip('"'), line[2].split(','))))
        
        if (has_value):
            file_data[-1][4] = line[0]
           
        # file_type and sentence contain duplicate informations, but are kept for readability
        line = file_name + [file_name[0][0:9]] + [sentence] + [previous] + [0] + line
        #adding previous as feature
        previous = line[5]
        file_data += [line]
        
        #adding next frame as feature
        has_value = True
        
        

    return pd.DataFrame(file_data, columns=['file', 'speaker', 'sentence', 'previous_packet', 'next_packet','packet_size', 'phonemes', 'words'])

def load_files_with_prev_next(directory):
    filelist = os.listdir(directory)
    #read them into pandas
    df_list = [file_parser_with_prev_next(directory+file) for file in filelist]
    #concatenate them together
    return pd.concat(df_list, ignore_index=True)

In [3]:
def convert_types(data_frame):
    data_frame['packet_size'] = pd.to_numeric(data_frame['packet_size'])
    data_frame['previous_packet'] = pd.to_numeric(data_frame['previous_packet'])
    data_frame['next_packet'] = pd.to_numeric(data_frame['next_packet'])

    data_frame['file'] = data_frame['file'].astype('category')
    data_frame['sentence'] = data_frame['sentence'].astype('category')
    data_frame['speaker'] = data_frame['speaker'].astype('category')

In [4]:
whatsapp_data_train = load_files_with_prev_next("./../data/whatsapp_train_data/")
whatsapp_data_test = load_files_with_prev_next("./../data/whatsapp_test_data/")
convert_types(whatsapp_data_train)
convert_types(whatsapp_data_test)
whatsapp_data_test

Unnamed: 0,file,speaker,sentence,previous_packet,next_packet,packet_size,phonemes,words
0,DR1-FAKS0-SA1.CSV,DR1-FAKS0,She had your dark suit in greasy wash water al...,0,342,249,"(h#,)","(,)"
1,DR1-FAKS0-SA1.CSV,DR1-FAKS0,She had your dark suit in greasy wash water al...,249,335,342,"(h#,)","(,)"
2,DR1-FAKS0-SA1.CSV,DR1-FAKS0,She had your dark suit in greasy wash water al...,342,303,335,"(h#,)","(,)"
3,DR1-FAKS0-SA1.CSV,DR1-FAKS0,She had your dark suit in greasy wash water al...,335,364,303,"(h#, sh)","(she,)"
4,DR1-FAKS0-SA1.CSV,DR1-FAKS0,She had your dark suit in greasy wash water al...,303,418,364,"(sh, iy, hv)","(she, had)"
...,...,...,...,...,...,...,...,...
31584,DR8-MSLB0-SX383.CSV,DR8-MSLB0,The carpet cleaners shampooed our oriental rug.,338,370,303,"(r, ao, r)","(our, oriental)"
31585,DR8-MSLB0-SX383.CSV,DR8-MSLB0,The carpet cleaners shampooed our oriental rug.,303,314,370,"(r, iy, eh)","(oriental,)"
31586,DR8-MSLB0-SX383.CSV,DR8-MSLB0,The carpet cleaners shampooed our oriental rug.,370,303,314,"(eh, n, tcl, t)","(oriental,)"
31587,DR8-MSLB0-SX383.CSV,DR8-MSLB0,The carpet cleaners shampooed our oriental rug.,314,295,303,"(t, el, r, ah)","(oriental, rug)"


In [5]:
def add_surrounding(data_frame):
    data_frame['prev_curr'] = list(zip(data_frame.previous_packet, data_frame.packet_size))
    data_frame['next_curr'] = list(zip(data_frame.next_packet, data_frame.packet_size))
    data_frame['packet_surrounding'] = list(zip(data_frame.previous_packet, data_frame.packet_size, data_frame.next_packet))
    
    #data_frame['prev_curr'] = data_frame['prev_curr'].astype('category')
    #data_frame['next_curr'] = data_frame['next_curr'].astype('category')
    #data_frame['packet_surrounding'] = data_frame['packet_surrounding'].astype('category')

add_surrounding(whatsapp_data_train)
add_surrounding(whatsapp_data_test)

whatsapp_data_train = whatsapp_data_train[['file', 'speaker', 'sentence', 'previous_packet', 'next_packet','packet_size', 'prev_curr', 'next_curr', 'packet_surrounding', 'phonemes', 'words']]
whatsapp_data_test = whatsapp_data_test[['file', 'speaker', 'sentence', 'previous_packet', 'next_packet','packet_size', 'prev_curr', 'next_curr', 'packet_surrounding', 'phonemes', 'words']]
whatsapp_data_train

Unnamed: 0,file,speaker,sentence,previous_packet,next_packet,packet_size,prev_curr,next_curr,packet_surrounding,phonemes,words
0,DR1-FCJF0-SA1.CSV,DR1-FCJF0,She had your dark suit in greasy wash water al...,0,380,227,"(0, 227)","(380, 227)","(0, 227, 380)","(h#,)","(,)"
1,DR1-FCJF0-SA1.CSV,DR1-FCJF0,She had your dark suit in greasy wash water al...,227,407,380,"(227, 380)","(407, 380)","(227, 380, 407)","(h#, sh, ix)","(she,)"
2,DR1-FCJF0-SA1.CSV,DR1-FCJF0,She had your dark suit in greasy wash water al...,380,350,407,"(380, 407)","(350, 407)","(380, 407, 350)","(ix, hv, eh)","(she, had)"
3,DR1-FCJF0-SA1.CSV,DR1-FCJF0,She had your dark suit in greasy wash water al...,407,281,350,"(407, 350)","(281, 350)","(407, 350, 281)","(eh, dcl, jh)","(had, your)"
4,DR1-FCJF0-SA1.CSV,DR1-FCJF0,She had your dark suit in greasy wash water al...,350,327,281,"(350, 281)","(327, 281)","(350, 281, 327)","(jh, ih, dcl, d, ah)","(had, your, dark)"
...,...,...,...,...,...,...,...,...,...,...,...
86492,DR8-MTCS0-SX82.CSV,DR8-MTCS0,Good service should be rewarded by big tips.,286,253,268,"(286, 268)","(253, 268)","(286, 268, 253)","(ay, bcl, b, ih)","(by, big)"
86493,DR8-MTCS0-SX82.CSV,DR8-MTCS0,Good service should be rewarded by big tips.,268,315,253,"(268, 253)","(315, 253)","(268, 253, 315)","(ih, gcl)","(big,)"
86494,DR8-MTCS0-SX82.CSV,DR8-MTCS0,Good service should be rewarded by big tips.,253,279,315,"(253, 315)","(279, 315)","(253, 315, 279)","(gcl, t, ih)","(big, tips)"
86495,DR8-MTCS0-SX82.CSV,DR8-MTCS0,Good service should be rewarded by big tips.,315,392,279,"(315, 279)","(392, 279)","(315, 279, 392)","(ih, pcl, p)","(tips,)"


### Preparing data

In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

In [7]:
# add removal of labels for the test_dataset
def get_labels(df, label=["words"], feature=["previous_packet", "packet_size", "next_packet"]):
    labels = df[label]
    features = df[feature]
    return features, labels

In [8]:
def prepare_labels(train_labels, test_labels, label=["words"]):
    train_labels = train_labels.astype('category')
    test_labels = test_labels.astype('category')
    
    total_labels = train_labels.append(test_labels)
    
    lab_enc = LabelEncoder()
    lab_enc.fit(total_labels[label])

    train_labels = lab_enc.transform(train_labels[label])
    test_labels = lab_enc.transform(test_labels[label])
    
    return train_labels, test_labels, lab_enc

In [9]:
train_set, train_labels = get_labels(whatsapp_data_train)
test_set, test_labels = get_labels(whatsapp_data_test)

train_labels = train_labels.astype('category')
test_labels = test_labels.astype('category')

total_labels = train_labels.append(test_labels)
print(len(pd.unique(train_labels.words)))
print(len(pd.unique(test_labels.words)))
total_unique_words = len(pd.unique(total_labels.words))
total_unique_words

16168
6739


21317

Now we can see that we have a really big problem => there are 5149 new words that we have never seen. As we saw in our analysis we can't really generalise on never seen words before.

In [11]:
train_set, train_labels = get_labels(whatsapp_data_train, label=['phonemes'])
test_set, test_labels = get_labels(whatsapp_data_test, label=['phonemes'])

train_labels = train_labels.astype('category')
test_labels = test_labels.astype('category')

total_labels = train_labels.append(test_labels)
print(len(pd.unique(train_labels.phonemes)))
print(len(pd.unique(test_labels.phonemes)))
total_unique_phonemes = len(pd.unique(total_labels.phonemes))
total_unique_phonemes

27369
12655


33990

With phonemes the situation is a bit different, as there are more phonemes and we haven't seen only half of them.

### Tree classifier

First model that we will be trying is tree classifier. In the analysis we have noticed, that there is almost a 1:1 correspondence of trigram of phoneme sizes and words (eg. that for every trigram of phoneme sizes there is different word). 

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [8]:
train_set, train_labels = get_labels(whatsapp_data_train)
test_set, test_labels = get_labels(whatsapp_data_test)

train_labels = train_labels.astype('category')
test_labels = test_labels.astype('category')

In [9]:
total_labels = train_labels.append(test_labels)
print(len(pd.unique(train_labels.words)))
print(len(pd.unique(test_labels.words)))
len(pd.unique(total_labels.words))

16168
6739


21317

In [10]:
lab_enc = LabelEncoder()
lab_enc.fit(total_labels.words)

train_labels = lab_enc.transform(train_labels.words)
test_labels = lab_enc.transform(test_labels.words)
train_labels

array([    0, 15548, 15578, ...,  2531, 18676, 18676])

In [11]:
tree_clf_pipeline = Pipeline(
    [
        (
            "clf",
            DecisionTreeClassifier(criterion="entropy", max_depth=None, splitter="best",
                                   min_samples_split=2, random_state=42),
        ),
    ]
)
# Words: criterion="entropy", max_depth=None, splitter="best", min_samples_split=2, random_state=42 => 0.97, 0.02
# 

In [12]:
print("Starting!")
tree_clf_pipeline.fit(train_set, train_labels)
print("Finished!")

print(f"Train accuracy: {tree_clf_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {tree_clf_pipeline.score(test_set, test_labels):.4f}")

Train accuracy: 0.2631
Test accuracy : 0.0380


Phonemes

In [15]:
train_set, train_labels = get_labels(whatsapp_data_train, label=["phonemes"])
test_set, test_labels = get_labels(whatsapp_data_test, label=["phonemes"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels, label=["phonemes"])

array([12858, 13445, 15553, ..., 12837, 14816, 24059])

In [16]:
print("Starting!")
tree_clf_pipeline.fit(train_set, train_labels)
print("Finished!")

print(f"Train accuracy: {tree_clf_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {tree_clf_pipeline.score(test_set, test_labels):.4f}")

Train accuracy: 0.2454
Test accuracy : 0.0293


I have been able to run these classificators and the best results I was able to get were around 3%, which isn't that good considering KNN was able to get twice that much.

### KNN

Let's take a look a different kind of classifier => k nearest neighbours. This classifier shouldn't need that much RAM and that much of a computational power.

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [10]:
train_set, train_labels = get_labels(whatsapp_data_train, label=["words"])
test_set, test_labels = get_labels(whatsapp_data_test, label=["words"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels)

  return f(**kwargs)


In [17]:
knn_clf_pipeline = Pipeline(
    [
        (
            "clf",
            KNeighborsClassifier(20, weights='distance', n_jobs=4)
        ),
    ]
)

# train accuracy 0.9723
#5   => 0.0313
#10  => 0.0377
#20  => 0.0450
#32  => 0.0497
#64  => 0.0567
#128 => 0.0625
#256 => 0.0668

# uniform gives better test results but doesn't seem to be able to "answer correctly" on the train test
# 64 => 0.0927, 0.0685

In [18]:
print("Starting!")
knn_clf_pipeline.fit(train_set, train_labels)
print("Finished!")

print(f"Train accuracy: {knn_clf_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {knn_clf_pipeline.score(test_set, test_labels):.4f}")

Train accuracy: 0.9723
Test accuracy : 0.0438


For the search space of 64 nearest neighbours we get only 5.67% success rate on our test data (which is around 1971 words). I have listed other parameters and their resulting percentages in the comments in the code cell. Also worth noting is that "StandardScaler" only worsens our predictions.

This whole section is just made as sanity check that we actually get expected results (that is we only guess the words we've already seen and none from which we haven't seen).

Probably remove -----

In [20]:
data_test_copy = whatsapp_data_test.copy()

column_select = list(map(lambda x: x in list(whatsapp_data_train.words.drop_duplicates()), list(data_test_copy.words)))

print("Known words:\t", column_select.count(True))
print("Unknown words:\t", column_select.count(False))

Known words:	 16405
Unknown words:	 15184


In [21]:
data_test_copy = data_test_copy[column_select]

In [22]:
train_set, train_labels = get_labels(whatsapp_data_train, label=["words"])
test_set, test_labels = get_labels(data_test_copy, label=["words"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels)

  return f(**kwargs)


In [23]:
print("Starting!")
# knn_clf_pipeline.fit(train_set, train_labels)
print("Finished!")

# 256, distance => 0.9723, 0.1286 on only 

print(f"Train accuracy: {knn_clf_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {knn_clf_pipeline.score(test_set, test_labels):.4f}")

Train accuracy: 0.0516
Test accuracy : 0.0518


From this we get that the succes rate on known is around double the ammount on all words (this can be seen from the output of a cell 2 cells above).

In [24]:
train_set, train_labels = get_labels(whatsapp_data_train, label=["words"])
test_set, test_labels = get_labels(whatsapp_data_test[list(map(lambda x: not x, column_select))], label=["words"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels)

  return f(**kwargs)


In [25]:
print("Starting!")
#knn_clf_pipeline.fit(train_set, train_labels)
print("Finished!")

print(f"Train accuracy: {knn_clf_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {knn_clf_pipeline.score(test_set, test_labels):.4f}")

Train accuracy: 0.9723
Test accuracy : 0.0000


This test was only made as "sanity check" as it is indeed highly probable that our model wouldn't be able to properly guess on never seen examples of words.

To here remove -------

Now let's try our luck with phonemes:

In [26]:
train_set, train_labels = get_labels(whatsapp_data_train, label=["phonemes"])
test_set, test_labels = get_labels(whatsapp_data_test, label=["phonemes"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels, label=["phonemes"])

  return f(**kwargs)


In [27]:
knn_clf_pipeline = Pipeline(
    [
        (
            "clf",
            KNeighborsClassifier(16, weights='distance', n_jobs=4)
        ),
    ]
)
# 5 => 0.0256
# 6 => 0.0269
# 10 => 0.0299
# 20 => 0.0343
# 32 => 0.0368
# 64 => 0.0404
# 128 => 0.0441
# 256 => 0.0465

In [28]:
print("Starting!")
knn_clf_pipeline.fit(train_set, train_labels)
print("Finished!")

print(f"Train accuracy: {knn_clf_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {knn_clf_pipeline.score(test_set, test_labels):.4f}")

Train accuracy: 0.9723
Test accuracy : 0.0324


We can clearly see that phonemes didn't help us that much and that the results are far worse from those gotten by exploring words.

### Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [9]:
train_set, train_labels = get_labels(whatsapp_data_train, label=["words"])
test_set, test_labels = get_labels(whatsapp_data_test, label=["words"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels)

  return f(*args, **kwargs)


In [10]:
rfc_pipeline = Pipeline(
    [
        (
            "clf",
            RandomForestClassifier(max_depth=12, random_state=42, criterion = 'entropy', n_jobs = -1, min_samples_split = 2)
        ),
    ]
)

# (max_depth=12, random_state=42, criterion = 'entropy', n_jobs = -1, min_samples_split = 2) => 0.6181, 0.0651

In [11]:
print("Starting!")
rfc_pipeline.fit(train_set, train_labels)
print("Finished!")

print(f"Train accuracy: {rfc_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {rfc_pipeline.score(test_set, test_labels):.4f}")

Starting!
Finished!
Train accuracy: 0.6181
Test accuracy : 0.0651


We can see, that this indeed has better accuracy than normal tree / KNN, but takes way more system resources.

### AdaBoost Classifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [9]:
train_set, train_labels = get_labels(whatsapp_data_train, label=["words"])
test_set, test_labels = get_labels(whatsapp_data_test, label=["words"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels)

  return f(*args, **kwargs)


In [10]:
abc_pipeline = Pipeline(
    [
        (
            "clf",
            AdaBoostClassifier(random_state=1, n_estimators = 60, learning_rate=0.9)
        ),
    ]
)


In [1]:
print("Starting!")
#abc_pipeline.fit(train_set, train_labels)
print("Finished!")

#print(f"Train accuracy: {abc_pipeline.score(train_set, train_labels):.4f}")
#print(f"Test accuracy : {abc_pipeline.score(test_set, test_labels):.4f}")

print("0.0505, 0.0471")

0.0505, 0.0471


This classifier ended in absolute failure as it wasn't able to get even acceptable results on the train data. And it even took 8 hours to learn (this is because it can only use 1 thread), so this classifier is pretty much worthless to us.

### MLP Classifier

In [12]:
import keras
import tensorflow as tf

from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import classification_report

In [9]:
train_set, train_labels = get_labels(whatsapp_data_train, label=["words"])
test_set, test_labels = get_labels(whatsapp_data_test, label=["words"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels)

  return f(*args, **kwargs)


In [10]:
print(train_set.shape)
print(train_labels.shape)

(86497, 3)
(86497,)


In [11]:
from keras.utils import to_categorical

train_labels = to_categorical(train_labels, num_classes=total_unique_words)
test_labels = to_categorical(test_labels, num_classes=total_unique_words)
print(train_labels.shape)

(86497, 21317)


In [12]:
from keras.models import Sequential
from keras.layers import Dense

model = Sequential()

model.add(Dense(units=512, activation='relu', input_dim=3*1))  # first hidden layer
model.add(Dense(units=256, activation='relu'))  # second hidden layer
# model.add(Dense(units=128, activation='relu'))  # third hidden layer
model.add(Dense(units=total_unique_words, activation='softmax'))  # output layer
# model.add(Dense(units=128))  # output layer

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               2048      
_________________________________________________________________
dense_1 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_2 (Dense)              (None, 21317)             5478469   
Total params: 5,611,845
Trainable params: 5,611,845
Non-trainable params: 0
_________________________________________________________________


In [13]:
model.fit(train_set, train_labels, epochs=64, batch_size=256)

Epoch 1/1024
Epoch 2/1024
Epoch 3/1024
Epoch 4/1024
Epoch 5/1024
Epoch 6/1024
Epoch 7/1024
Epoch 8/1024
Epoch 9/1024
Epoch 10/1024
Epoch 11/1024
Epoch 12/1024
Epoch 13/1024
Epoch 14/1024
Epoch 15/1024
Epoch 16/1024
Epoch 17/1024
Epoch 18/1024
Epoch 19/1024
Epoch 20/1024
Epoch 21/1024
Epoch 22/1024
Epoch 23/1024
Epoch 24/1024
Epoch 25/1024
Epoch 26/1024
Epoch 27/1024
Epoch 28/1024
Epoch 29/1024
Epoch 30/1024
Epoch 31/1024
Epoch 32/1024
Epoch 33/1024
Epoch 34/1024
Epoch 35/1024
Epoch 36/1024
Epoch 37/1024
Epoch 38/1024
Epoch 39/1024
Epoch 40/1024
Epoch 41/1024
Epoch 42/1024
Epoch 43/1024
Epoch 44/1024
Epoch 45/1024
Epoch 46/1024
Epoch 47/1024
Epoch 48/1024
Epoch 49/1024
Epoch 50/1024
Epoch 51/1024
Epoch 52/1024
Epoch 53/1024
Epoch 54/1024
Epoch 55/1024
Epoch 56/1024
Epoch 57/1024
Epoch 58/1024
Epoch 59/1024
Epoch 60/1024
Epoch 61/1024
Epoch 62/1024
Epoch 63/1024
Epoch 64/1024
Epoch 65/1024
Epoch 66/1024
Epoch 67/1024
Epoch 68/1024
Epoch 69/1024
Epoch 70/1024
Epoch 71/1024
Epoch 72/1024
E

Epoch 157/1024
Epoch 158/1024
Epoch 159/1024
Epoch 160/1024
Epoch 161/1024
Epoch 162/1024
Epoch 163/1024
Epoch 164/1024
Epoch 165/1024
Epoch 166/1024
Epoch 167/1024
Epoch 168/1024
Epoch 169/1024
Epoch 170/1024
Epoch 171/1024
Epoch 172/1024
Epoch 173/1024
Epoch 174/1024
Epoch 175/1024
Epoch 176/1024
Epoch 177/1024
Epoch 178/1024
Epoch 179/1024
Epoch 180/1024
Epoch 181/1024
Epoch 182/1024
Epoch 183/1024
Epoch 184/1024
Epoch 185/1024
Epoch 186/1024
Epoch 187/1024
Epoch 188/1024
Epoch 189/1024
Epoch 190/1024
Epoch 191/1024
Epoch 192/1024
Epoch 193/1024
Epoch 194/1024
Epoch 195/1024
Epoch 196/1024
Epoch 197/1024
Epoch 198/1024
Epoch 199/1024
Epoch 200/1024
Epoch 201/1024
Epoch 202/1024
Epoch 203/1024
Epoch 204/1024
Epoch 205/1024
Epoch 206/1024
Epoch 207/1024
Epoch 208/1024
Epoch 209/1024
Epoch 210/1024
Epoch 211/1024
Epoch 212/1024
Epoch 213/1024
Epoch 214/1024
Epoch 215/1024
Epoch 216/1024
Epoch 217/1024
Epoch 218/1024
Epoch 219/1024
Epoch 220/1024
Epoch 221/1024
Epoch 222/1024
Epoch 223/

Epoch 310/1024
Epoch 311/1024
Epoch 312/1024
Epoch 313/1024
Epoch 314/1024
Epoch 315/1024
Epoch 316/1024
Epoch 317/1024
Epoch 318/1024
Epoch 319/1024
Epoch 320/1024
Epoch 321/1024
Epoch 322/1024
Epoch 323/1024
Epoch 324/1024
Epoch 325/1024
Epoch 326/1024
Epoch 327/1024
Epoch 328/1024
Epoch 329/1024
Epoch 330/1024
Epoch 331/1024
Epoch 332/1024
Epoch 333/1024
Epoch 334/1024
Epoch 335/1024
Epoch 336/1024
Epoch 337/1024
Epoch 338/1024
Epoch 339/1024
Epoch 340/1024
Epoch 341/1024
Epoch 342/1024
Epoch 343/1024
Epoch 344/1024
Epoch 345/1024
Epoch 346/1024
Epoch 347/1024
Epoch 348/1024
Epoch 349/1024
Epoch 350/1024
Epoch 351/1024
Epoch 352/1024
Epoch 353/1024
Epoch 354/1024
Epoch 355/1024
Epoch 356/1024
Epoch 357/1024
Epoch 358/1024
Epoch 359/1024
Epoch 360/1024
Epoch 361/1024
Epoch 362/1024
Epoch 363/1024
Epoch 364/1024
Epoch 365/1024
Epoch 366/1024
Epoch 367/1024
Epoch 368/1024
Epoch 369/1024
Epoch 370/1024
Epoch 371/1024
Epoch 372/1024
Epoch 373/1024
Epoch 374/1024
Epoch 375/1024
Epoch 376/

Epoch 465/1024
Epoch 466/1024
Epoch 467/1024
Epoch 468/1024
Epoch 469/1024
Epoch 470/1024
Epoch 471/1024
Epoch 472/1024
Epoch 473/1024
Epoch 474/1024
Epoch 475/1024
Epoch 476/1024
Epoch 477/1024
Epoch 478/1024
Epoch 479/1024
Epoch 480/1024
Epoch 481/1024
Epoch 482/1024
Epoch 483/1024
Epoch 484/1024
Epoch 485/1024
Epoch 486/1024
Epoch 487/1024
Epoch 488/1024
Epoch 489/1024
Epoch 490/1024
Epoch 491/1024
Epoch 492/1024
Epoch 493/1024
Epoch 494/1024
Epoch 495/1024
Epoch 496/1024
Epoch 497/1024
Epoch 498/1024
Epoch 499/1024
Epoch 500/1024
Epoch 501/1024
Epoch 502/1024
Epoch 503/1024
Epoch 504/1024
Epoch 505/1024
Epoch 506/1024
Epoch 507/1024
Epoch 508/1024
Epoch 509/1024
Epoch 510/1024
Epoch 511/1024
Epoch 512/1024
Epoch 513/1024
Epoch 514/1024
Epoch 515/1024
Epoch 516/1024
Epoch 517/1024
Epoch 518/1024
Epoch 519/1024
Epoch 520/1024
Epoch 521/1024
Epoch 522/1024
Epoch 523/1024
Epoch 524/1024
Epoch 525/1024
Epoch 526/1024
Epoch 527/1024
Epoch 528/1024
Epoch 529/1024
Epoch 530/1024
Epoch 531/

Epoch 619/1024
Epoch 620/1024
Epoch 621/1024
Epoch 622/1024
Epoch 623/1024
Epoch 624/1024
Epoch 625/1024
Epoch 626/1024
Epoch 627/1024
Epoch 628/1024
Epoch 629/1024
Epoch 630/1024
Epoch 631/1024
Epoch 632/1024
Epoch 633/1024
Epoch 634/1024
Epoch 635/1024
Epoch 636/1024
Epoch 637/1024
Epoch 638/1024
Epoch 639/1024
Epoch 640/1024
Epoch 641/1024
Epoch 642/1024
Epoch 643/1024
Epoch 644/1024
Epoch 645/1024
Epoch 646/1024
Epoch 647/1024
Epoch 648/1024
Epoch 649/1024
Epoch 650/1024
Epoch 651/1024
Epoch 652/1024
Epoch 653/1024
Epoch 654/1024
Epoch 655/1024
Epoch 656/1024
Epoch 657/1024
Epoch 658/1024
Epoch 659/1024
Epoch 660/1024
Epoch 661/1024
Epoch 662/1024
Epoch 663/1024
Epoch 664/1024
Epoch 665/1024
Epoch 666/1024
Epoch 667/1024
Epoch 668/1024
Epoch 669/1024
Epoch 670/1024
Epoch 671/1024
Epoch 672/1024
Epoch 673/1024
Epoch 674/1024
Epoch 675/1024
Epoch 676/1024
Epoch 677/1024
Epoch 678/1024
Epoch 679/1024
Epoch 680/1024
Epoch 681/1024
Epoch 682/1024
Epoch 683/1024
Epoch 684/1024
Epoch 685/

Epoch 774/1024
Epoch 775/1024
Epoch 776/1024
Epoch 777/1024
Epoch 778/1024
Epoch 779/1024
Epoch 780/1024
Epoch 781/1024
Epoch 782/1024
Epoch 783/1024
Epoch 784/1024
Epoch 785/1024
Epoch 786/1024
Epoch 787/1024
Epoch 788/1024
Epoch 789/1024
Epoch 790/1024
Epoch 791/1024
Epoch 792/1024
Epoch 793/1024
Epoch 794/1024
Epoch 795/1024
Epoch 796/1024
Epoch 797/1024
Epoch 798/1024
Epoch 799/1024
Epoch 800/1024
Epoch 801/1024
Epoch 802/1024
Epoch 803/1024
Epoch 804/1024
Epoch 805/1024
Epoch 806/1024
Epoch 807/1024
Epoch 808/1024
Epoch 809/1024
Epoch 810/1024
Epoch 811/1024
Epoch 812/1024
Epoch 813/1024
Epoch 814/1024
Epoch 815/1024
Epoch 816/1024
Epoch 817/1024
Epoch 818/1024
Epoch 819/1024
Epoch 820/1024
Epoch 821/1024
Epoch 822/1024
Epoch 823/1024
Epoch 824/1024
Epoch 825/1024
Epoch 826/1024
Epoch 827/1024
Epoch 828/1024
Epoch 829/1024
Epoch 830/1024
Epoch 831/1024
Epoch 832/1024
Epoch 833/1024
Epoch 834/1024
Epoch 835/1024
Epoch 836/1024
Epoch 837/1024
Epoch 838/1024
Epoch 839/1024
Epoch 840/

Epoch 928/1024
Epoch 929/1024
Epoch 930/1024
Epoch 931/1024
Epoch 932/1024
Epoch 933/1024
Epoch 934/1024
Epoch 935/1024
Epoch 936/1024
Epoch 937/1024
Epoch 938/1024
Epoch 939/1024
Epoch 940/1024
Epoch 941/1024
Epoch 942/1024
Epoch 943/1024
Epoch 944/1024
Epoch 945/1024
Epoch 946/1024
Epoch 947/1024
Epoch 948/1024
Epoch 949/1024
Epoch 950/1024
Epoch 951/1024
Epoch 952/1024
Epoch 953/1024
Epoch 954/1024
Epoch 955/1024
Epoch 956/1024
Epoch 957/1024
Epoch 958/1024
Epoch 959/1024
Epoch 960/1024
Epoch 961/1024
Epoch 962/1024
Epoch 963/1024
Epoch 964/1024
Epoch 965/1024
Epoch 966/1024
Epoch 967/1024
Epoch 968/1024
Epoch 969/1024
Epoch 970/1024
Epoch 971/1024
Epoch 972/1024
Epoch 973/1024
Epoch 974/1024
Epoch 975/1024
Epoch 976/1024
Epoch 977/1024
Epoch 978/1024
Epoch 979/1024
Epoch 980/1024
Epoch 981/1024
Epoch 982/1024
Epoch 983/1024
Epoch 984/1024
Epoch 985/1024
Epoch 986/1024
Epoch 987/1024
Epoch 988/1024
Epoch 989/1024
Epoch 990/1024
Epoch 991/1024
Epoch 992/1024
Epoch 993/1024
Epoch 994/

Epoch 1005/1024
Epoch 1006/1024
Epoch 1007/1024
Epoch 1008/1024
Epoch 1009/1024
Epoch 1010/1024
Epoch 1011/1024
Epoch 1012/1024
Epoch 1013/1024
Epoch 1014/1024
Epoch 1015/1024
Epoch 1016/1024
Epoch 1017/1024
Epoch 1018/1024
Epoch 1019/1024
Epoch 1020/1024
Epoch 1021/1024
Epoch 1022/1024
Epoch 1023/1024
Epoch 1024/1024


<tensorflow.python.keras.callbacks.History at 0x7f62b198c048>

In [15]:
print("test loss, test acc:", model.evaluate(test_set, test_labels))

test loss, test acc: [28.130123138427734, 0.07205040007829666]


batch size = 128:  
test loss, test acc: [15.796355247497559, 0.0679350420832634] => 50 epochs  
test loss, test acc: [18.180967330932617, 0.06521257758140564] => 100 epochs

batch size = 256:  
test loss, test acc: [21.200485229492188, 0.071987085044384] => 256 epochs
test loss, test acc: [28.130123138427734, 0.07205040007829666] => 1024 epochs

In [16]:
"""
pred_y = model.predict(test_set)
print(len(pred_y))
print(pred_y[0])

pred_y_labels = [0]*len(pred_y)
for i in range(len(pred_y)):
    pred_y_labels[i] = np.argmax(pred_y[i])
    
print(pred_y_labels[0])

print(classification_report(test_labels, pred_y_labels))
"""
print("Not used")

#### Phonemes

In [14]:
train_set, train_labels = get_labels(whatsapp_data_train, label=["phonemes"])
test_set, test_labels = get_labels(whatsapp_data_test, label=["phonemes"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels, label=["phonemes"])

  return f(*args, **kwargs)


In [15]:
print(train_set.shape)
print(train_labels.shape)

(86497, 3)
(86497,)


In [16]:
train_labels = to_categorical(train_labels, num_classes=total_unique_phonemes)
test_labels = to_categorical(test_labels, num_classes=total_unique_phonemes)
print(train_labels.shape)

(86497, 33990)


In [20]:
model = Sequential()

model.add(Dense(units=512, activation='relu', input_dim=3*1))  # first hidden layer
model.add(Dense(units=256, activation='relu'))  # second hidden layer
# model.add(Dense(units=128, activation='relu'))  # third hidden layer
model.add(Dense(units=total_unique_phonemes, activation='softmax'))  # output layer
# model.add(Dense(units=128))  # output layer

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 512)               2048      
_________________________________________________________________
dense_4 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_5 (Dense)              (None, 33990)             8735430   
Total params: 8,868,806
Trainable params: 8,868,806
Non-trainable params: 0
_________________________________________________________________


In [21]:
model.fit(train_set, train_labels, epochs=128, batch_size=256)

Epoch 1/128
Epoch 2/128
Epoch 3/128
Epoch 4/128
Epoch 5/128
Epoch 6/128
Epoch 7/128
Epoch 8/128
Epoch 9/128
Epoch 10/128
Epoch 11/128
Epoch 12/128
Epoch 13/128
Epoch 14/128
Epoch 15/128
Epoch 16/128
Epoch 17/128
Epoch 18/128
Epoch 19/128
Epoch 20/128
Epoch 21/128
Epoch 22/128
Epoch 23/128
Epoch 24/128
Epoch 25/128
Epoch 26/128
Epoch 27/128
Epoch 28/128
Epoch 29/128
Epoch 30/128
Epoch 31/128
Epoch 32/128
Epoch 33/128
Epoch 34/128
Epoch 35/128
Epoch 36/128
Epoch 37/128
Epoch 38/128
Epoch 39/128
Epoch 40/128
Epoch 41/128
Epoch 42/128
Epoch 43/128
Epoch 44/128
Epoch 45/128
Epoch 46/128
Epoch 47/128
Epoch 48/128
Epoch 49/128
Epoch 50/128
Epoch 51/128
Epoch 52/128
Epoch 53/128
Epoch 54/128
Epoch 55/128
Epoch 56/128
Epoch 57/128
Epoch 58/128
Epoch 59/128
Epoch 60/128
Epoch 61/128
Epoch 62/128
Epoch 63/128
Epoch 64/128
Epoch 65/128
Epoch 66/128
Epoch 67/128
Epoch 68/128
Epoch 69/128
Epoch 70/128
Epoch 71/128
Epoch 72/128
Epoch 73/128
Epoch 74/128
Epoch 75/128
Epoch 76/128
Epoch 77/128
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7f18668f1eb8>

In [22]:
print("test loss, test acc:", model.evaluate(test_set, test_labels))

test loss, test acc: [20.68207550048828, 0.049479249864816666]


batch_size 256:  
test loss, test acc: [9.716291427612305, 0.04188166931271553] => 4 epochs  
test loss, test acc: [20.68207550048828, 0.049479249864816666] => 128 epochs

### LSTM