# Replication of attack on VoIP end-to-end encrypted messengers

## Models

### Loading and preprocessing

We will now try to explore various models on `Whatsapp` dataset. Bellow we will find loading and preprocessing that we have come up with in the analysis section.

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

sns.set()  # make plots nicer

np.random.seed(42)  # set seed for reproducibility

In [3]:
def file_parser_with_prev_next(path):
    file = open(path, 'r')
    lines = file.readlines()
    
    file_name = [path.split('/')[-1]]
    sentence = ""
    file_data = []
    
    has_value = False
    previous = 0
    
    for line in lines:
        line = line.strip()
        
        # if there are only 2 informations on line and second is h#, then ignore
        # strip line, split primarly on ; secondary on ,
        if (line.startswith('#')):
            if (not sentence):
                sentence = line[len('# Sentence: "'): len(line) - 1]
            continue
        
        line = line.split(';')
        
        if (len(line) == 1):
            #lines containing only their packet size and nothing else, they should be added
            #TODO
            line += [""]
            line += [""]
            #continue
        
        if (len(line) == 2):
            #this tries to remove most of the silence at the start of the recording
            #potentionally harmfull as we shouldn't clean test data this way (we will be reading labels)
            #if (line[1] == 'h#'):
            #    continue
            line += [""]
        
        line[1] = tuple(line[1].split(','))
        line[2] = tuple(list(map(lambda a: a.strip('"'), line[2].split(','))))
        
        if (has_value):
            file_data[-1][4] = line[0]
           
        # file_type and sentence contain duplicate informations, but are kept for readability
        line = file_name + [file_name[0][0:9]] + [sentence] + [previous] + [0] + line
        #adding previous as feature
        previous = line[5]
        file_data += [line]
        
        #adding next frame as feature
        has_value = True
        
        

    return pd.DataFrame(file_data, columns=['file', 'speaker', 'sentence', 'previous_packet', 'next_packet','packet_size', 'phonemes', 'words'])

def load_files_with_prev_next(directory):
    filelist = os.listdir(directory)
    #read them into pandas
    df_list = [file_parser_with_prev_next(directory+file) for file in filelist]
    #concatenate them together
    return pd.concat(df_list, ignore_index=True)

In [4]:
def convert_types(data_frame):
    data_frame['packet_size'] = pd.to_numeric(data_frame['packet_size'])
    data_frame['previous_packet'] = pd.to_numeric(data_frame['previous_packet'])
    data_frame['next_packet'] = pd.to_numeric(data_frame['next_packet'])

    data_frame['file'] = data_frame['file'].astype('category')
    data_frame['sentence'] = data_frame['sentence'].astype('category')
    data_frame['speaker'] = data_frame['speaker'].astype('category')

In [5]:
whatsapp_data_train = load_files_with_prev_next("./../data/whatsapp_train_data/")
whatsapp_data_test = load_files_with_prev_next("./../data/whatsapp_test_data/")
convert_types(whatsapp_data_train)
convert_types(whatsapp_data_test)
whatsapp_data_test

Unnamed: 0,file,speaker,sentence,previous_packet,next_packet,packet_size,phonemes,words
0,DR1-FAKS0-SA1.CSV,DR1-FAKS0,She had your dark suit in greasy wash water al...,0,342,249,"(h#,)","(,)"
1,DR1-FAKS0-SA1.CSV,DR1-FAKS0,She had your dark suit in greasy wash water al...,249,335,342,"(h#,)","(,)"
2,DR1-FAKS0-SA1.CSV,DR1-FAKS0,She had your dark suit in greasy wash water al...,342,303,335,"(h#,)","(,)"
3,DR1-FAKS0-SA1.CSV,DR1-FAKS0,She had your dark suit in greasy wash water al...,335,364,303,"(h#, sh)","(she,)"
4,DR1-FAKS0-SA1.CSV,DR1-FAKS0,She had your dark suit in greasy wash water al...,303,418,364,"(sh, iy, hv)","(she, had)"
...,...,...,...,...,...,...,...,...
31584,DR8-MSLB0-SX383.CSV,DR8-MSLB0,The carpet cleaners shampooed our oriental rug.,338,370,303,"(r, ao, r)","(our, oriental)"
31585,DR8-MSLB0-SX383.CSV,DR8-MSLB0,The carpet cleaners shampooed our oriental rug.,303,314,370,"(r, iy, eh)","(oriental,)"
31586,DR8-MSLB0-SX383.CSV,DR8-MSLB0,The carpet cleaners shampooed our oriental rug.,370,303,314,"(eh, n, tcl, t)","(oriental,)"
31587,DR8-MSLB0-SX383.CSV,DR8-MSLB0,The carpet cleaners shampooed our oriental rug.,314,295,303,"(t, el, r, ah)","(oriental, rug)"


In [6]:
def add_surrounding(data_frame):
    data_frame['prev_curr'] = list(zip(data_frame.previous_packet, data_frame.packet_size))
    data_frame['next_curr'] = list(zip(data_frame.next_packet, data_frame.packet_size))
    data_frame['packet_surrounding'] = list(zip(data_frame.previous_packet, data_frame.packet_size, data_frame.next_packet))
    
    #data_frame['prev_curr'] = data_frame['prev_curr'].astype('category')
    #data_frame['next_curr'] = data_frame['next_curr'].astype('category')
    #data_frame['packet_surrounding'] = data_frame['packet_surrounding'].astype('category')

add_surrounding(whatsapp_data_train)
add_surrounding(whatsapp_data_test)

whatsapp_data_train = whatsapp_data_train[['file', 'speaker', 'sentence', 'previous_packet', 'next_packet','packet_size', 'prev_curr', 'next_curr', 'packet_surrounding', 'phonemes', 'words']]
whatsapp_data_test = whatsapp_data_test[['file', 'speaker', 'sentence', 'previous_packet', 'next_packet','packet_size', 'prev_curr', 'next_curr', 'packet_surrounding', 'phonemes', 'words']]
whatsapp_data_train

Unnamed: 0,file,speaker,sentence,previous_packet,next_packet,packet_size,prev_curr,next_curr,packet_surrounding,phonemes,words
0,DR1-FCJF0-SA1.CSV,DR1-FCJF0,She had your dark suit in greasy wash water al...,0,380,227,"(0, 227)","(380, 227)","(0, 227, 380)","(h#,)","(,)"
1,DR1-FCJF0-SA1.CSV,DR1-FCJF0,She had your dark suit in greasy wash water al...,227,407,380,"(227, 380)","(407, 380)","(227, 380, 407)","(h#, sh, ix)","(she,)"
2,DR1-FCJF0-SA1.CSV,DR1-FCJF0,She had your dark suit in greasy wash water al...,380,350,407,"(380, 407)","(350, 407)","(380, 407, 350)","(ix, hv, eh)","(she, had)"
3,DR1-FCJF0-SA1.CSV,DR1-FCJF0,She had your dark suit in greasy wash water al...,407,281,350,"(407, 350)","(281, 350)","(407, 350, 281)","(eh, dcl, jh)","(had, your)"
4,DR1-FCJF0-SA1.CSV,DR1-FCJF0,She had your dark suit in greasy wash water al...,350,327,281,"(350, 281)","(327, 281)","(350, 281, 327)","(jh, ih, dcl, d, ah)","(had, your, dark)"
...,...,...,...,...,...,...,...,...,...,...,...
86492,DR8-MTCS0-SX82.CSV,DR8-MTCS0,Good service should be rewarded by big tips.,286,253,268,"(286, 268)","(253, 268)","(286, 268, 253)","(ay, bcl, b, ih)","(by, big)"
86493,DR8-MTCS0-SX82.CSV,DR8-MTCS0,Good service should be rewarded by big tips.,268,315,253,"(268, 253)","(315, 253)","(268, 253, 315)","(ih, gcl)","(big,)"
86494,DR8-MTCS0-SX82.CSV,DR8-MTCS0,Good service should be rewarded by big tips.,253,279,315,"(253, 315)","(279, 315)","(253, 315, 279)","(gcl, t, ih)","(big, tips)"
86495,DR8-MTCS0-SX82.CSV,DR8-MTCS0,Good service should be rewarded by big tips.,315,392,279,"(315, 279)","(392, 279)","(315, 279, 392)","(ih, pcl, p)","(tips,)"


### Preparing data

In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

In [8]:
# add removal of labels for the test_dataset
def get_labels(df, label=["words"], feature=["previous_packet", "packet_size", "next_packet"]):
    labels = df.loc[:, label]
    features = df.loc[:, feature]
    return features, labels

In [9]:
def prepare_labels(train_labels, test_labels, label=["words"]):
    train_labels = train_labels.astype('category')
    test_labels = test_labels.astype('category')
    
    total_labels = train_labels.append(test_labels)
    
    lab_enc = LabelEncoder()
    lab_enc.fit(total_labels[label])

    train_labels = lab_enc.transform(train_labels[label])
    test_labels = lab_enc.transform(test_labels[label])
    
    return train_labels, test_labels, lab_enc

In [10]:
train_set, train_labels = get_labels(whatsapp_data_train)
test_set, test_labels = get_labels(whatsapp_data_test)

train_labels = train_labels.astype('category')
test_labels = test_labels.astype('category')

total_labels = train_labels.append(test_labels)
print(len(pd.unique(train_labels.words)))
print(len(pd.unique(test_labels.words)))
total_unique_words = len(pd.unique(total_labels.words))
total_unique_words

16168
6739


21317

Now we can see that we have a really big problem => there are 5149 new words that we have never seen. As we saw in our analysis we can't really generalise on never seen words before.

In [11]:
train_set, train_labels = get_labels(whatsapp_data_train, label=['phonemes'])
test_set, test_labels = get_labels(whatsapp_data_test, label=['phonemes'])

train_labels = train_labels.astype('category')
test_labels = test_labels.astype('category')

total_labels = train_labels.append(test_labels)
print(len(pd.unique(train_labels.phonemes)))
print(len(pd.unique(test_labels.phonemes)))
total_unique_phonemes = len(pd.unique(total_labels.phonemes))
total_unique_phonemes

27369
12655


33990

With phonemes the situation is a bit different, as there are more phonemes and we haven't seen only half of them.

### Tree classifier

First model that we will be trying is tree classifier. In the analysis we have noticed, that there is almost a 1:1 correspondence of trigram of phoneme sizes and words (eg. that for every trigram of phoneme sizes there is different word). 

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [8]:
train_set, train_labels = get_labels(whatsapp_data_train)
test_set, test_labels = get_labels(whatsapp_data_test)

train_labels = train_labels.astype('category')
test_labels = test_labels.astype('category')

In [9]:
total_labels = train_labels.append(test_labels)
print(len(pd.unique(train_labels.words)))
print(len(pd.unique(test_labels.words)))
len(pd.unique(total_labels.words))

16168
6739


21317

In [10]:
lab_enc = LabelEncoder()
lab_enc.fit(total_labels.words)

train_labels = lab_enc.transform(train_labels.words)
test_labels = lab_enc.transform(test_labels.words)
train_labels

array([    0, 15548, 15578, ...,  2531, 18676, 18676])

In [11]:
tree_clf_pipeline = Pipeline(
    [
        (
            "clf",
            DecisionTreeClassifier(criterion="entropy", max_depth=None, splitter="best",
                                   min_samples_split=2, random_state=42),
        ),
    ]
)
# Words: criterion="entropy", max_depth=None, splitter="best", min_samples_split=2, random_state=42 => 0.97, 0.02
# 

In [12]:
print("Starting!")
tree_clf_pipeline.fit(train_set, train_labels)
print("Finished!")

print(f"Train accuracy: {tree_clf_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {tree_clf_pipeline.score(test_set, test_labels):.4f}")

Train accuracy: 0.2631
Test accuracy : 0.0380


Phonemes

In [15]:
train_set, train_labels = get_labels(whatsapp_data_train, label=["phonemes"])
test_set, test_labels = get_labels(whatsapp_data_test, label=["phonemes"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels, label=["phonemes"])

array([12858, 13445, 15553, ..., 12837, 14816, 24059])

In [16]:
print("Starting!")
tree_clf_pipeline.fit(train_set, train_labels)
print("Finished!")

print(f"Train accuracy: {tree_clf_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {tree_clf_pipeline.score(test_set, test_labels):.4f}")

Train accuracy: 0.2454
Test accuracy : 0.0293


I have been able to run these classificators and the best results I was able to get were around 3%, which isn't that good considering KNN was able to get twice that much.

### KNN

Let's take a look a different kind of classifier => k nearest neighbours. This classifier shouldn't need that much RAM and that much of a computational power.

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [10]:
train_set, train_labels = get_labels(whatsapp_data_train, label=["words"])
test_set, test_labels = get_labels(whatsapp_data_test, label=["words"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels)

  return f(**kwargs)


In [17]:
knn_clf_pipeline = Pipeline(
    [
        (
            "clf",
            KNeighborsClassifier(20, weights='distance', n_jobs=4)
        ),
    ]
)

# train accuracy 0.9723
#5   => 0.0313
#10  => 0.0377
#20  => 0.0450
#32  => 0.0497
#64  => 0.0567
#128 => 0.0625
#256 => 0.0668

# uniform gives better test results but doesn't seem to be able to "answer correctly" on the train test
# 64 => 0.0927, 0.0685

In [18]:
print("Starting!")
knn_clf_pipeline.fit(train_set, train_labels)
print("Finished!")

print(f"Train accuracy: {knn_clf_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {knn_clf_pipeline.score(test_set, test_labels):.4f}")

Train accuracy: 0.9723
Test accuracy : 0.0438


For the search space of 64 nearest neighbours we get only 5.67% success rate on our test data (which is around 1971 words). I have listed other parameters and their resulting percentages in the comments in the code cell. Also worth noting is that "StandardScaler" only worsens our predictions.

This whole section is just made as sanity check that we actually get expected results (that is we only guess the words we've already seen and none from which we haven't seen).

Probably remove -----

In [20]:
data_test_copy = whatsapp_data_test.copy()

column_select = list(map(lambda x: x in list(whatsapp_data_train.words.drop_duplicates()), list(data_test_copy.words)))

print("Known words:\t", column_select.count(True))
print("Unknown words:\t", column_select.count(False))

Known words:	 16405
Unknown words:	 15184


In [21]:
data_test_copy = data_test_copy[column_select]

In [22]:
train_set, train_labels = get_labels(whatsapp_data_train, label=["words"])
test_set, test_labels = get_labels(data_test_copy, label=["words"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels)

  return f(**kwargs)


In [23]:
print("Starting!")
# knn_clf_pipeline.fit(train_set, train_labels)
print("Finished!")

# 256, distance => 0.9723, 0.1286 on only 

print(f"Train accuracy: {knn_clf_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {knn_clf_pipeline.score(test_set, test_labels):.4f}")

Train accuracy: 0.0516
Test accuracy : 0.0518


From this we get that the succes rate on known is around double the ammount on all words (this can be seen from the output of a cell 2 cells above).

In [24]:
train_set, train_labels = get_labels(whatsapp_data_train, label=["words"])
test_set, test_labels = get_labels(whatsapp_data_test[list(map(lambda x: not x, column_select))], label=["words"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels)

  return f(**kwargs)


In [25]:
print("Starting!")
#knn_clf_pipeline.fit(train_set, train_labels)
print("Finished!")

print(f"Train accuracy: {knn_clf_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {knn_clf_pipeline.score(test_set, test_labels):.4f}")

Train accuracy: 0.9723
Test accuracy : 0.0000


This test was only made as "sanity check" as it is indeed highly probable that our model wouldn't be able to properly guess on never seen examples of words.

To here remove -------

Now let's try our luck with phonemes:

In [26]:
train_set, train_labels = get_labels(whatsapp_data_train, label=["phonemes"])
test_set, test_labels = get_labels(whatsapp_data_test, label=["phonemes"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels, label=["phonemes"])

  return f(**kwargs)


In [27]:
knn_clf_pipeline = Pipeline(
    [
        (
            "clf",
            KNeighborsClassifier(16, weights='distance', n_jobs=4)
        ),
    ]
)
# 5 => 0.0256
# 6 => 0.0269
# 10 => 0.0299
# 20 => 0.0343
# 32 => 0.0368
# 64 => 0.0404
# 128 => 0.0441
# 256 => 0.0465

In [28]:
print("Starting!")
knn_clf_pipeline.fit(train_set, train_labels)
print("Finished!")

print(f"Train accuracy: {knn_clf_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {knn_clf_pipeline.score(test_set, test_labels):.4f}")

Train accuracy: 0.9723
Test accuracy : 0.0324


We can clearly see that phonemes didn't help us that much and that the results are far worse from those gotten by exploring words.

### Random forest

In [11]:
from sklearn.ensemble import RandomForestClassifier

In [9]:
train_set, train_labels = get_labels(whatsapp_data_train, label=["words"])
test_set, test_labels = get_labels(whatsapp_data_test, label=["words"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels)

  return f(*args, **kwargs)


In [10]:
rfc_pipeline = Pipeline(
    [
        (
            "clf",
            RandomForestClassifier(max_depth=12, random_state=42, criterion = 'entropy', n_jobs = -1, min_samples_split = 2)
        ),
    ]
)

# (max_depth=12, random_state=42, criterion = 'entropy', n_jobs = -1, min_samples_split = 2) => 0.6181, 0.0651

In [11]:
print("Starting!")
rfc_pipeline.fit(train_set, train_labels)
print("Finished!")

print(f"Train accuracy: {rfc_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {rfc_pipeline.score(test_set, test_labels):.4f}")

Starting!
Finished!
Train accuracy: 0.6181
Test accuracy : 0.0651


We can see, that this indeed has better accuracy than normal tree / KNN, but takes way more system resources.

In [15]:
train_set, train_labels = get_labels(whatsapp_data_train, label=["phonemes"])
test_set, test_labels = get_labels(whatsapp_data_test, label=["phonemes"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels, label=["phonemes"])

  return f(*args, **kwargs)


In [16]:
rfc_pipeline = Pipeline(
    [
        (
            "clf",
            RandomForestClassifier(max_depth=8, random_state=42, criterion = 'entropy', n_jobs = -1, min_samples_split = 2)
        ),
    ]
)

In [17]:
print("Starting!")
rfc_pipeline.fit(train_set, train_labels)
print("Finished!")

print(f"Train accuracy: {rfc_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {rfc_pipeline.score(test_set, test_labels):.4f}")

Starting!
Finished!


MemoryError: Unable to allocate 17.6 GiB for an array with shape (86497, 27369) and data type float64

### AdaBoost Classifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [9]:
train_set, train_labels = get_labels(whatsapp_data_train, label=["words"])
test_set, test_labels = get_labels(whatsapp_data_test, label=["words"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels)

  return f(*args, **kwargs)


In [10]:
abc_pipeline = Pipeline(
    [
        (
            "clf",
            AdaBoostClassifier(random_state=1, n_estimators = 60, learning_rate=0.9)
        ),
    ]
)


In [1]:
print("Starting!")
#abc_pipeline.fit(train_set, train_labels)
print("Finished!")

#print(f"Train accuracy: {abc_pipeline.score(train_set, train_labels):.4f}")
#print(f"Test accuracy : {abc_pipeline.score(test_set, test_labels):.4f}")

print("0.0505, 0.0471")

0.0505, 0.0471


This classifier ended in absolute failure as it wasn't able to get even acceptable results on the train data. And it even took 8 hours to learn (this is because it can only use 1 thread), so this classifier is pretty much worthless to us.

### MLP Classifier

In [12]:
import keras
import tensorflow as tf

from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import classification_report

In [20]:
train_set, train_labels = get_labels(whatsapp_data_train, label=["words"])
test_set, test_labels = get_labels(whatsapp_data_test, label=["words"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels)

  return f(*args, **kwargs)


In [21]:
print(train_set.shape)
print(train_labels.shape)

(86497, 3)
(86497,)


In [22]:
from keras.utils import to_categorical

train_labels = to_categorical(train_labels, num_classes=total_unique_words)
test_labels = to_categorical(test_labels, num_classes=total_unique_words)
print(train_labels.shape)

(86497, 21317)


In [23]:
from keras.models import Sequential
from keras.layers import Dense

model = Sequential()

model.add(Dense(units=512, activation='relu', input_dim=3*1))  # first hidden layer
model.add(Dense(units=256, activation='relu'))  # second hidden layer
# model.add(Dense(units=128, activation='relu'))  # third hidden layer
model.add(Dense(units=total_unique_words, activation='softmax'))  # output layer
# model.add(Dense(units=128))  # output layer

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               2048      
_________________________________________________________________
dense_1 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_2 (Dense)              (None, 21317)             5478469   
Total params: 5,611,845
Trainable params: 5,611,845
Non-trainable params: 0
_________________________________________________________________


In [24]:
model.fit(train_set, train_labels, epochs=64, batch_size=256)

Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64
Epoch 14/64
Epoch 15/64
Epoch 16/64
Epoch 17/64
Epoch 18/64
Epoch 19/64
Epoch 20/64
Epoch 21/64
Epoch 22/64
Epoch 23/64
Epoch 24/64
Epoch 25/64
Epoch 26/64
Epoch 27/64
Epoch 28/64
Epoch 29/64
Epoch 30/64
Epoch 31/64
Epoch 32/64
Epoch 33/64
Epoch 34/64
Epoch 35/64
Epoch 36/64
Epoch 37/64
Epoch 38/64
Epoch 39/64
Epoch 40/64
Epoch 41/64
Epoch 42/64
Epoch 43/64
Epoch 44/64
Epoch 45/64
Epoch 46/64
Epoch 47/64
Epoch 48/64
Epoch 49/64
Epoch 50/64
Epoch 51/64
Epoch 52/64
Epoch 53/64
Epoch 54/64
Epoch 55/64
Epoch 56/64
Epoch 57/64
Epoch 58/64
Epoch 59/64
Epoch 60/64
Epoch 61/64
Epoch 62/64
Epoch 63/64
Epoch 64/64


<tensorflow.python.keras.callbacks.History at 0x7f448eca4a90>

In [25]:
print("train loss, train acc:", model.evaluate(train_set, train_labels))

train loss, train acc: [6.8337321281433105, 0.06833762675523758]


In [26]:
print("test loss, test acc:", model.evaluate(test_set, test_labels))

test loss, test acc: [17.056943893432617, 0.06476937979459763]


batch size = 128:  
test loss, test acc: [15.796355247497559, 0.0679350420832634] => 50 epochs  
test loss, test acc: [18.180967330932617, 0.06521257758140564] => 100 epochs

batch size = 256:  
test loss, test acc: [21.200485229492188, 0.071987085044384] => 256 epochs
test loss, test acc: [28.130123138427734, 0.07205040007829666] => 1024 epochs

#### Phonemes

In [12]:
train_set, train_labels = get_labels(whatsapp_data_train, label=["phonemes"])
test_set, test_labels = get_labels(whatsapp_data_test, label=["phonemes"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels, label=["phonemes"])

  return f(*args, **kwargs)


In [13]:
print(train_set.shape)
print(train_labels.shape)

(86497, 3)
(86497,)


In [14]:
train_labels = to_categorical(train_labels, num_classes=total_unique_phonemes)
test_labels = to_categorical(test_labels, num_classes=total_unique_phonemes)
print(train_labels.shape)

(86497, 33990)


In [15]:
model = Sequential()

model.add(Dense(units=512, activation='relu', input_dim=3*1))  # first hidden layer
model.add(Dense(units=256, activation='relu'))  # second hidden layer
# model.add(Dense(units=128, activation='relu'))  # third hidden layer
model.add(Dense(units=total_unique_phonemes, activation='softmax'))  # output layer
# model.add(Dense(units=128))  # output layer

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               2048      
_________________________________________________________________
dense_1 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_2 (Dense)              (None, 33990)             8735430   
Total params: 8,868,806
Trainable params: 8,868,806
Non-trainable params: 0
_________________________________________________________________


In [16]:
model.fit(train_set, train_labels, epochs=64, batch_size=256)

Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64
Epoch 14/64
Epoch 15/64
Epoch 16/64
Epoch 17/64
Epoch 18/64
Epoch 19/64
Epoch 20/64
Epoch 21/64
Epoch 22/64
Epoch 23/64
Epoch 24/64
Epoch 25/64
Epoch 26/64
Epoch 27/64
Epoch 28/64
Epoch 29/64
Epoch 30/64
Epoch 31/64
Epoch 32/64
Epoch 33/64
Epoch 34/64
Epoch 35/64
Epoch 36/64
Epoch 37/64
Epoch 38/64
Epoch 39/64
Epoch 40/64
Epoch 41/64
Epoch 42/64
Epoch 43/64
Epoch 44/64
Epoch 45/64
Epoch 46/64
Epoch 47/64
Epoch 48/64
Epoch 49/64
Epoch 50/64
Epoch 51/64
Epoch 52/64
Epoch 53/64
Epoch 54/64
Epoch 55/64
Epoch 56/64
Epoch 57/64
Epoch 58/64
Epoch 59/64
Epoch 60/64
Epoch 61/64
Epoch 62/64
Epoch 63/64
Epoch 64/64


<tensorflow.python.keras.callbacks.History at 0x7f9ff8380f98>

In [17]:
print("train loss, train acc:", model.evaluate(train_set, train_labels))

train loss, train acc: [6.943676471710205, 0.052868884056806564]


In [18]:
print("test loss, test acc:", model.evaluate(test_set, test_labels))

test loss, test acc: [16.903888702392578, 0.0468517504632473]


batch_size 256:  
test loss, test acc: [9.716291427612305, 0.04188166931271553] => 4 epochs  
test loss, test acc: [20.68207550048828, 0.049479249864816666] => 128 epochs

### LSTM

In [13]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

In [15]:
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers import SpatialDropout1D

In [22]:
#more elaborate model
model_lstm = Sequential()

#model_lstm.add(Embedding(input_dim = 3, output_dim = 2, input_length = 86497))
#model_lstm.add(SpatialDropout1D(0.3))
model_lstm.add(LSTM(256, input_shape = (1, 3), dropout = 0.3, recurrent_dropout = 0.3))
model_lstm.add(Dense(256, activation = 'relu'))
model_lstm.add(Dropout(0.3))
model_lstm.add(Dense(total_unique_words, activation = 'softmax'))

model_lstm.compile(
    loss='categorical_crossentropy',
    optimizer='Adam',
    metrics=['accuracy']
)

model_lstm.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 256)               266240    
_________________________________________________________________
dense_4 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 21317)             5478469   
Total params: 5,810,501
Trainable params: 5,810,501
Non-trainable params: 0
_________________________________________________________________


In [23]:
train_set, train_labels = get_labels(whatsapp_data_train, label=["words"])
test_set, test_labels = get_labels(whatsapp_data_test, label=["words"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels, label=["words"])

  return f(*args, **kwargs)


In [24]:
from keras.utils import to_categorical

train_labels = to_categorical(train_labels, num_classes=total_unique_words)
test_labels = to_categorical(test_labels, num_classes=total_unique_words)
print(train_labels.shape)

(86497, 21317)


In [25]:
train_set.values.reshape(-1,1,3).shape

(86497, 1, 3)

In [26]:
reshaped_values = train_set.values.reshape(-1, 1, 3)
reshaped_values[0][0]

array([  0, 227, 380])

In [27]:
model_lstm.fit(reshaped_values, train_labels, epochs=64, batch_size=256)

Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64
Epoch 14/64
Epoch 15/64
Epoch 16/64
Epoch 17/64
Epoch 18/64
Epoch 19/64
Epoch 20/64
Epoch 21/64
Epoch 22/64
Epoch 23/64
Epoch 24/64
Epoch 25/64
Epoch 26/64
Epoch 27/64
Epoch 28/64
Epoch 29/64
Epoch 30/64
Epoch 31/64
Epoch 32/64
Epoch 33/64
Epoch 34/64
Epoch 35/64
Epoch 36/64
Epoch 37/64
Epoch 38/64
Epoch 39/64
Epoch 40/64
Epoch 41/64
Epoch 42/64
Epoch 43/64
Epoch 44/64
Epoch 45/64
Epoch 46/64
Epoch 47/64
Epoch 48/64
Epoch 49/64
Epoch 50/64
Epoch 51/64
Epoch 52/64
Epoch 53/64
Epoch 54/64
Epoch 55/64
Epoch 56/64
Epoch 57/64
Epoch 58/64
Epoch 59/64
Epoch 60/64
Epoch 61/64
Epoch 62/64
Epoch 63/64
Epoch 64/64


<tensorflow.python.keras.callbacks.History at 0x7f9ff7fdda58>

In [28]:
print("test loss, test acc:", model_lstm.evaluate(test_set.values.reshape(-1, 1, 3), test_labels))

test loss, test acc: [14.57046890258789, 0.05498749390244484]


In [29]:
print("train loss, train acc:", model_lstm.evaluate(reshaped_values, train_labels))

train loss, train acc: [7.088553428649902, 0.05795576795935631]


The results are really underwhelming and I have no clue why.

#### Phonemes

In [16]:
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers import SpatialDropout1D

In [24]:
#more elaborate model
model_lstm = Sequential()

#model_lstm.add(Embedding(input_dim = 3, output_dim = 2, input_length = 86497))
#model_lstm.add(SpatialDropout1D(0.3))
model_lstm.add(LSTM(256, input_shape = (1, 3), dropout = 0.3, recurrent_dropout = 0.3))
model_lstm.add(Dense(256, activation = 'relu'))
model_lstm.add(Dropout(0.3))
model_lstm.add(Dense(total_unique_phonemes, activation = 'softmax'))

model_lstm.compile(
    loss='categorical_crossentropy',
    optimizer='Adam',
    metrics=['accuracy']
)

model_lstm.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 256)               266240    
_________________________________________________________________
dense_3 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 33990)             8735430   
Total params: 9,067,462
Trainable params: 9,067,462
Non-trainable params: 0
_________________________________________________________________


In [25]:
train_set, train_labels = get_labels(whatsapp_data_train, label=["phonemes"])
test_set, test_labels = get_labels(whatsapp_data_test, label=["phonemes"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels, label=["phonemes"])

In [26]:
from keras.utils import to_categorical

train_labels = to_categorical(train_labels, num_classes=total_unique_phonemes)
test_labels = to_categorical(test_labels, num_classes=total_unique_phonemes)
print(train_labels.shape)

(86497, 33990)


In [27]:
reshaped_values = train_set.values.reshape(-1, 1, 3)
reshaped_values[0][0]

array([  0, 227, 380])

In [28]:
model_lstm.fit(reshaped_values, train_labels, epochs=64, batch_size=256)

Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64
Epoch 14/64
Epoch 15/64
Epoch 16/64
Epoch 17/64
Epoch 18/64
Epoch 19/64
Epoch 20/64
Epoch 21/64
Epoch 22/64
Epoch 23/64
Epoch 24/64
Epoch 25/64
Epoch 26/64
Epoch 27/64
Epoch 28/64
Epoch 29/64
Epoch 30/64
Epoch 31/64
Epoch 32/64
Epoch 33/64
Epoch 34/64
Epoch 35/64
Epoch 36/64
Epoch 37/64
Epoch 38/64
Epoch 39/64
Epoch 40/64
Epoch 41/64
Epoch 42/64
Epoch 43/64
Epoch 44/64
Epoch 45/64
Epoch 46/64
Epoch 47/64
Epoch 48/64
Epoch 49/64
Epoch 50/64
Epoch 51/64
Epoch 52/64
Epoch 53/64
Epoch 54/64
Epoch 55/64
Epoch 56/64
Epoch 57/64
Epoch 58/64
Epoch 59/64
Epoch 60/64
Epoch 61/64
Epoch 62/64
Epoch 63/64
Epoch 64/64


<tensorflow.python.keras.callbacks.History at 0x7fb182f05400>

In [29]:
print("test loss, test acc:", model_lstm.evaluate(test_set.values.reshape(-1, 1, 3), test_labels))

test loss, test acc: [12.568110466003418, 0.0433378703892231]


In [30]:
print("train loss, train acc:", model_lstm.evaluate(reshaped_values, train_labels))

train loss, train acc: [7.414408206939697, 0.048001665621995926]
