# Replication of attack on VoIP end-to-end encrypted messengers

## Models

### Loading and preprocessing

We will now explore various models on the `skype` dataset. Below you will find the loading and preprocessing code that we came up with in the analysis section.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Apply seaborn's default styling to subsequent matplotlib figures.
sns.set()  # make plots nicer

# Seeds NumPy's global random generator only; other libraries are not seeded in this cell.
np.random.seed(42)  # set seed for reproducibility

In [2]:
def file_parser_with_prev_next(path):
    """Parse one packet-capture text file into a DataFrame, one row per packet.

    Line formats handled by the parser:
      * ``# ...`` -- header/comment line. The first one seen supplies the
        sentence: the leading ``# Sentence: "`` prefix (by length) and the
        final character are cut off. Later ``#`` lines are ignored.
      * ``size``, ``size;phonemes`` or ``size;phonemes;words`` -- one packet.
        ``phonemes`` and ``words`` are comma separated; missing fields are
        treated as empty strings and double quotes around words are stripped.
      * blank lines are skipped (they carry no packet size).

    Every row also stores the size of the previous and the next packet of the
    same file; ``0`` marks "no neighbour" (first / last packet).

    Args:
        path: Path of the file to parse. Its base name is stored in the
            ``file`` column and the first nine characters of that name in
            the ``speaker`` column.

    Returns:
        pandas.DataFrame with the columns ``file``, ``speaker``, ``sentence``,
        ``previous_packet``, ``next_packet``, ``packet_size``, ``phonemes``
        and ``words``. Packet sizes are kept as the strings read from the
        file (only the ``0`` placeholders are ints); ``phonemes`` and
        ``words`` are tuples of strings.
    """
    file_name = os.path.basename(path)
    # file and speaker contain duplicate information, but are kept for readability
    speaker = file_name[0:9]
    sentence = ""
    file_data = []
    previous = 0

    # The context manager closes the handle even if parsing raises.
    with open(path, 'r') as file:
        for line in file:
            line = line.strip()

            if not line:
                continue

            if line.startswith('#'):
                if not sentence:
                    sentence = line[len('# Sentence: "'): len(line) - 1]
                continue

            fields = line.split(';')
            # Lines may hold only the size, or size + phonemes: pad to three fields.
            # Silence ('h#') packets are deliberately kept; filtering them would
            # require reading the labels, which is not allowed for test data.
            fields += [""] * (3 - len(fields))

            packet_size = fields[0]
            fields[1] = tuple(fields[1].split(','))
            fields[2] = tuple(word.strip('"') for word in fields[2].split(','))

            # Back-fill the "next packet" feature of the previously stored row.
            if file_data:
                file_data[-1][4] = packet_size

            file_data.append([file_name, speaker, sentence, previous, 0] + fields)
            # The current size becomes the "previous packet" feature of the next row.
            previous = packet_size

    return pd.DataFrame(file_data, columns=['file', 'speaker', 'sentence', 'previous_packet', 'next_packet','packet_size', 'phonemes', 'words'])

def load_files_with_prev_next(directory):
    """Parse every entry of ``directory`` and stack the results into one DataFrame.

    Args:
        directory: Directory whose entries are each passed to
            ``file_parser_with_prev_next``. A trailing slash is optional.

    Returns:
        A single DataFrame with a fresh 0..n-1 index holding the rows of all
        parsed files, in sorted file-name order.
    """
    # os.listdir returns names in arbitrary order; sorting keeps the row order
    # (and therefore everything derived from it) reproducible between runs.
    filelist = sorted(os.listdir(directory))
    # read them into pandas; os.path.join works with or without a trailing slash
    df_list = [file_parser_with_prev_next(os.path.join(directory, file)) for file in filelist]
    # concatenate them together
    return pd.concat(df_list, ignore_index=True)

In [3]:
def convert_types(data_frame):
    """Cast the parsed columns to their working dtypes, modifying the frame in place.

    The three packet-size columns are converted with ``pd.to_numeric``; the
    ``file``, ``sentence`` and ``speaker`` columns become pandas categoricals.
    Nothing is returned.
    """
    for numeric_column in ('packet_size', 'previous_packet', 'next_packet'):
        data_frame[numeric_column] = pd.to_numeric(data_frame[numeric_column])

    for categorical_column in ('file', 'sentence', 'speaker'):
        data_frame[categorical_column] = data_frame[categorical_column].astype('category')

In [4]:
# Parse every capture file of the train and test directories into one DataFrame each.
skype_data_train = load_files_with_prev_next("./../data/skype_train_data/")
skype_data_test = load_files_with_prev_next("./../data/skype_test_data/")
# convert_types returns nothing and updates the frames in place.
convert_types(skype_data_train)
convert_types(skype_data_test)
# Last expression of the cell: renders the test frame.
skype_data_test

Unnamed: 0,file,speaker,sentence,previous_packet,next_packet,packet_size,phonemes,words
0,DR1-FAKS0-SA1.CSV,DR1-FAKS0,She had your dark suit in greasy wash water al...,0,35,30,"(h#,)","(,)"
1,DR1-FAKS0-SA1.CSV,DR1-FAKS0,She had your dark suit in greasy wash water al...,30,43,35,"(h#,)","(,)"
2,DR1-FAKS0-SA1.CSV,DR1-FAKS0,She had your dark suit in greasy wash water al...,35,26,43,"(h#,)","(,)"
3,DR1-FAKS0-SA1.CSV,DR1-FAKS0,She had your dark suit in greasy wash water al...,43,30,26,"(h#,)","(,)"
4,DR1-FAKS0-SA1.CSV,DR1-FAKS0,She had your dark suit in greasy wash water al...,26,31,30,"(h#,)","(,)"
...,...,...,...,...,...,...,...,...
258516,DR8-MSLB0-SX383.CSV,DR8-MSLB0,The carpet cleaners shampooed our oriental rug.,40,43,46,"(h#,)","(,)"
258517,DR8-MSLB0-SX383.CSV,DR8-MSLB0,The carpet cleaners shampooed our oriental rug.,46,41,43,"(h#,)","(,)"
258518,DR8-MSLB0-SX383.CSV,DR8-MSLB0,The carpet cleaners shampooed our oriental rug.,43,34,41,"(h#,)","(,)"
258519,DR8-MSLB0-SX383.CSV,DR8-MSLB0,The carpet cleaners shampooed our oriental rug.,41,33,34,"(h#,)","(,)"


In [5]:
def add_surrounding(data_frame):
    """Add tuple-valued context columns to ``data_frame`` in place.

    New columns:
      * ``prev_curr``          -- (previous_packet, packet_size)
      * ``next_curr``          -- (next_packet, packet_size)
      * ``packet_surrounding`` -- (previous_packet, packet_size, next_packet)

    Nothing is returned.
    """
    before = data_frame['previous_packet']
    current = data_frame['packet_size']
    after = data_frame['next_packet']

    data_frame['prev_curr'] = list(zip(before, current))
    data_frame['next_curr'] = list(zip(after, current))
    data_frame['packet_surrounding'] = list(zip(before, current, after))

    # The new columns are left as plain tuples; converting them to the
    # 'category' dtype was tried and is intentionally left disabled.

# add_surrounding adds the tuple columns to both frames in place.
add_surrounding(skype_data_train)
add_surrounding(skype_data_test)

# Reorder the columns so that all packet-size features precede the label columns
# (phonemes, words); the same column order is used for both splits.
skype_data_train = skype_data_train[['file', 'speaker', 'sentence', 'previous_packet', 'next_packet','packet_size', 'prev_curr', 'next_curr', 'packet_surrounding', 'phonemes', 'words']]
skype_data_test = skype_data_test[['file', 'speaker', 'sentence', 'previous_packet', 'next_packet','packet_size', 'prev_curr', 'next_curr', 'packet_surrounding', 'phonemes', 'words']]
# Last expression of the cell: renders the train frame.
skype_data_train

Unnamed: 0,file,speaker,sentence,previous_packet,next_packet,packet_size,prev_curr,next_curr,packet_surrounding,phonemes,words
0,DR1-FCJF0-SA1.CSV,DR1-FCJF0,She had your dark suit in greasy wash water al...,0,32,32,"(0, 32)","(32, 32)","(0, 32, 32)","(h#,)","(,)"
1,DR1-FCJF0-SA1.CSV,DR1-FCJF0,She had your dark suit in greasy wash water al...,32,31,32,"(32, 32)","(31, 32)","(32, 32, 31)","(h#,)","(,)"
2,DR1-FCJF0-SA1.CSV,DR1-FCJF0,She had your dark suit in greasy wash water al...,32,28,31,"(32, 31)","(28, 31)","(32, 31, 28)","(h#,)","(,)"
3,DR1-FCJF0-SA1.CSV,DR1-FCJF0,She had your dark suit in greasy wash water al...,31,28,28,"(31, 28)","(28, 28)","(31, 28, 28)","(h#,)","(,)"
4,DR1-FCJF0-SA1.CSV,DR1-FCJF0,She had your dark suit in greasy wash water al...,28,36,28,"(28, 28)","(36, 28)","(28, 28, 36)","(h#,)","(,)"
...,...,...,...,...,...,...,...,...,...,...,...
707433,DR8-MTCS0-SX82.CSV,DR8-MTCS0,Good service should be rewarded by big tips.,47,34,32,"(47, 32)","(34, 32)","(47, 32, 34)","(h#,)","(,)"
707434,DR8-MTCS0-SX82.CSV,DR8-MTCS0,Good service should be rewarded by big tips.,32,39,34,"(32, 34)","(39, 34)","(32, 34, 39)","(h#,)","(,)"
707435,DR8-MTCS0-SX82.CSV,DR8-MTCS0,Good service should be rewarded by big tips.,34,33,39,"(34, 39)","(33, 39)","(34, 39, 33)","(h#,)","(,)"
707436,DR8-MTCS0-SX82.CSV,DR8-MTCS0,Good service should be rewarded by big tips.,39,36,33,"(39, 33)","(36, 33)","(39, 33, 36)","(h#,)","(,)"


### Preparing data

In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

In [7]:
# add removal of labels for the test_dataset
def get_labels(df, label=["words"], feature=["previous_packet", "packet_size", "next_packet"]):
    """Split ``df`` into a feature frame and a label frame.

    Args:
        df: Source DataFrame.
        label: Column name(s) selected as the target.
        feature: Column name(s) selected as model inputs.

    Returns:
        ``(features, labels)`` -- the two column selections of ``df``.
    """
    labels, features = (df[columns] for columns in (label, feature))
    return features, labels

In [8]:
def prepare_labels(train_labels, test_labels, label=["words"]):
    """Integer-encode train and test labels with one shared ``LabelEncoder``.

    The encoder is fitted on the union of both label sets, so labels that
    occur only in the test split still receive a code.

    Args:
        train_labels: Frame holding the training labels (as returned by
            ``get_labels``).
        test_labels: Frame holding the test labels.
        label: Name(s) of the label column to encode.

    Returns:
        ``(train_codes, test_codes, encoder)`` -- the encoded labels of both
        splits and the fitted ``LabelEncoder``.
    """
    train_labels = train_labels.astype('category')
    test_labels = test_labels.astype('category')

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    total_labels = pd.concat([train_labels, test_labels])

    # LabelEncoder works on 1-D targets. Flattening the single-column selection
    # here avoids the column-vector conversion warning scikit-learn emits when
    # it is handed an (n, 1) frame.
    lab_enc = LabelEncoder()
    lab_enc.fit(total_labels[label].to_numpy().ravel())

    train_labels = lab_enc.transform(train_labels[label].to_numpy().ravel())
    test_labels = lab_enc.transform(test_labels[label].to_numpy().ravel())

    return train_labels, test_labels, lab_enc

In [11]:
# Word-level labels of both splits (default label column of get_labels).
train_set, train_labels = get_labels(skype_data_train)
test_set, test_labels = get_labels(skype_data_test)

train_labels = train_labels.astype('category')
test_labels = test_labels.astype('category')

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
total_labels = pd.concat([train_labels, test_labels])
# Distinct labels in train, in test, and in both splits combined.
print(len(pd.unique(train_labels.words)))
print(len(pd.unique(test_labels.words)))
total_unique_words = len(pd.unique(total_labels.words))
total_unique_words

15713
6626


20568

Now we can see that we have a really big problem: the test set contains 4855 new words that we have never seen during training. As we saw in our analysis, we can't really generalise to words we have never seen before, so this will hinder our results.

In [12]:
# Phoneme-level labels of both splits.
train_set, train_labels = get_labels(skype_data_train, label=['phonemes'])
test_set, test_labels = get_labels(skype_data_test, label=['phonemes'])

train_labels = train_labels.astype('category')
test_labels = test_labels.astype('category')

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
total_labels = pd.concat([train_labels, test_labels])
# Distinct labels in train, in test, and in both splits combined.
print(len(pd.unique(train_labels.phonemes)))
print(len(pd.unique(test_labels.phonemes)))
total_unique_phonemes = len(pd.unique(total_labels.phonemes))
total_unique_phonemes

3083
2363


3281

But this is really promising: there are only 198 new phonemes that we haven't seen.

### Tree classifier

The first model that we will try is a decision tree classifier.

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [9]:
# Word-level task: packet-size features as inputs, the `words` column as target.
train_set, train_labels = get_labels(skype_data_train, label=["words"])
test_set, test_labels = get_labels(skype_data_test, label=["words"])

# Integer-encode the labels; the fitted encoder (third return value) is discarded.
train_labels, test_labels, _ = prepare_labels(train_labels, test_labels)

In [12]:
# Single-step pipeline around a decision tree: entropy split criterion,
# no depth limit (max_depth=None) and a fixed random_state.
tree_clf_pipeline = Pipeline(
    [
        (
            "clf",
            DecisionTreeClassifier(criterion="entropy", max_depth=None, splitter="best",
                                   min_samples_split=2, random_state=42),
        ),
    ]
)

In [13]:
# Fit the tree on whatever train_set / train_labels currently hold (word labels
# when the cells are run in document order), then report the score on both splits.
print("Starting!")
tree_clf_pipeline.fit(train_set, train_labels)
print("Finished!")

print(f"Train accuracy: {tree_clf_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {tree_clf_pipeline.score(test_set, test_labels):.4f}")

Starting!
Finished!
Train accuracy: 0.2925
Test accuracy : 0.1195


Phonemes

In [14]:
# Phoneme-level task: same features, the `phonemes` column as target.
train_set, train_labels = get_labels(skype_data_train, label=["phonemes"])
test_set, test_labels = get_labels(skype_data_test, label=["phonemes"])

# Integer-encode the labels; the fitted encoder (third return value) is discarded.
train_labels, test_labels, _ = prepare_labels(train_labels, test_labels, label=["phonemes"])

In [17]:
# Re-uses the tree_clf_pipeline object defined for the word task and fits it
# again, this time on the current (phoneme) labels.
print("Starting!")
tree_clf_pipeline.fit(train_set, train_labels)
print("Finished!")

print(f"Train accuracy: {tree_clf_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {tree_clf_pipeline.score(test_set, test_labels):.4f}")

Starting!
Finished!
Train accuracy: 0.3282
Test accuracy : 0.1399


This is already quite a promising success rate for "just a simple" tree classifier. Also, as we can see, `phonemes` give better results than words on the skype dataset.

### KNN

Let's take a look at a different kind of classifier: k-nearest neighbours. This classifier shouldn't need as much RAM or as much computational power.

In [14]:
from sklearn.neighbors import KNeighborsClassifier

In [41]:
# Word-level task: packet-size features as inputs, the `words` column as target.
train_set, train_labels = get_labels(skype_data_train, label=["words"])
test_set, test_labels = get_labels(skype_data_test, label=["words"])

# Integer-encode the labels; the fitted encoder (third return value) is discarded.
train_labels, test_labels, _ = prepare_labels(train_labels, test_labels)

  return f(*args, **kwargs)


In [42]:
# Two-step pipeline for the word task: standardise the features, then classify
# with 32 distance-weighted nearest neighbours (n_jobs=-1).
knn_clf_pipeline = Pipeline(
    [
        (
            "scaler",
            StandardScaler()
        ),
        (
            "clf",
            KNeighborsClassifier(32, weights='distance', n_jobs=-1)
        ),
    ]
)

# Results of earlier manual runs, as: n_neighbors, weights => train accuracy, test accuracy
# 20, distance => 0.2887, 0.1203
# 32, uniform => 0.1700, 0.1343
# 32, distance => 0.2912, 0.1216

In [43]:
# Fit the KNN pipeline on the current training data and report the score on both splits.
print("Starting!")
knn_clf_pipeline.fit(train_set, train_labels)
print("Finished!")

print(f"Train accuracy: {knn_clf_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {knn_clf_pipeline.score(test_set, test_labels):.4f}")

Starting!
Finished!
Train accuracy: 0.2911
Test accuracy : 0.1213


For a search space of 32 nearest neighbours we get around a 12% success rate on our test data (roughly 31436 correctly labelled packets). I have listed other parameters and their resulting accuracies in the comments in the code cell. Also worth noting: `StandardScaler` only worsened our predictions in earlier experiments (this has not been re-tested on skype).

Now let's try our luck with phonemes:

In [24]:
# Phoneme-level task: same features, the `phonemes` column as target.
train_set, train_labels = get_labels(skype_data_train, label=["phonemes"])
test_set, test_labels = get_labels(skype_data_test, label=["phonemes"])

# Integer-encode the labels; the fitted encoder (third return value) is discarded.
train_labels, test_labels, _ = prepare_labels(train_labels, test_labels, label=["phonemes"])

  return f(*args, **kwargs)


In [25]:
# Redefines knn_clf_pipeline for the phoneme task. Unlike the word-task pipeline,
# this one has no StandardScaler step: the raw packet sizes go straight into KNN.
knn_clf_pipeline = Pipeline(
    [
        (
            "clf",
            KNeighborsClassifier(32, weights='distance', n_jobs=-1)
        ),
    ]
)
# Results of earlier manual runs, as: n_neighbors, weights => train accuracy, test accuracy
# 20, distance => 0.3221, 0.1377
# 32, uniform => 0.2093, 0.1574
# 32, distance => 0.3265, 0.1410

In [26]:
# Fit the scaler-free KNN pipeline on the phoneme labels and report the score on both splits.
print("Starting!")
knn_clf_pipeline.fit(train_set, train_labels)
print("Finished!")

print(f"Train accuracy: {knn_clf_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {knn_clf_pipeline.score(test_set, test_labels):.4f}")

Starting!
Finished!
Train accuracy: 0.3265
Test accuracy : 0.1410


We can clearly see that phonemes are indeed better than plain words and help us get better predictions. But of course this also adds the complication of how to assemble words (or anything that makes sense) from these phonemes.

### Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [27]:
# Phoneme-level task: same features, the `phonemes` column as target.
train_set, train_labels = get_labels(skype_data_train, label=["phonemes"])
test_set, test_labels = get_labels(skype_data_test, label=["phonemes"])

# Integer-encode the labels; the fitted encoder (third return value) is discarded.
train_labels, test_labels, _ = prepare_labels(train_labels, test_labels, label=["phonemes"])

  return f(*args, **kwargs)


In [28]:
# Single-step pipeline around a random forest: entropy criterion, trees capped
# at depth 12, fixed random_state, n_jobs=-1.
rfc_pipeline = Pipeline(
    [
        (
            "clf",
            RandomForestClassifier(max_depth=12, random_state=42, criterion = 'entropy', n_jobs = -1, min_samples_split = 2)
        ),
    ]
)

In [None]:
# Fit the random forest and report the score on both splits.
# NOTE(review): this cell has no recorded output and the note after it reads
# "RAM :(" — presumably the fit ran out of memory; confirm before re-running.
print("Starting!")
rfc_pipeline.fit(train_set, train_labels)
print("Finished!")

print(f"Train accuracy: {rfc_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {rfc_pipeline.score(test_set, test_labels):.4f}")

RAM :(

### AdaBoost Classifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [9]:
# Word-level task: packet-size features as inputs, the `words` column as target.
train_set, train_labels = get_labels(skype_data_train, label=["words"])
test_set, test_labels = get_labels(skype_data_test, label=["words"])

# Integer-encode the labels; the fitted encoder (third return value) is discarded.
train_labels, test_labels, _ = prepare_labels(train_labels, test_labels)

  return f(*args, **kwargs)


In [10]:
# Single-step pipeline around AdaBoost with 60 estimators and learning rate 0.9.
# Note: random_state is 1 here, whereas the other classifiers in this notebook use 42.
abc_pipeline = Pipeline(
    [
        (
            "clf",
            AdaBoostClassifier(random_state=1, n_estimators = 60, learning_rate=0.9)
        ),
    ]
)


In [1]:
# Training and scoring are commented out (the text after this cell reports that
# fitting took about 8 hours), so this cell trains nothing.
# The final print emits hard-coded numbers — presumably the train and test
# accuracy of that earlier run, in that order; TODO confirm.
print("Starting!")
#abc_pipeline.fit(train_set, train_labels)
print("Finished!")

#print(f"Train accuracy: {abc_pipeline.score(train_set, train_labels):.4f}")
#print(f"Test accuracy : {abc_pipeline.score(test_set, test_labels):.4f}")

print("0.0505, 0.0471")

0.0505, 0.0471


This classifier ended in absolute failure, as it wasn't able to get even acceptable results on the training data. It also took 8 hours to train (because it can only use 1 thread), so this classifier is pretty much worthless to us.

### MLP Classifier

Now let's bring out the big guns: neural networks. For this I've chosen to use TensorFlow and Keras (PyTorch could also be used). We are able to get reasonably better results, but at the cost of long compute times.

In [27]:
import keras
import tensorflow as tf

from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import classification_report

In [28]:
# Word-level task: packet-size features as inputs, the `words` column as target.
train_set, train_labels = get_labels(skype_data_train, label=["words"])
test_set, test_labels = get_labels(skype_data_test, label=["words"])

# Integer-encode the labels; the fitted encoder (third return value) is discarded.
train_labels, test_labels, _ = prepare_labels(train_labels, test_labels)

  return f(*args, **kwargs)


In [29]:
# Sanity check: the feature frame and the encoded label vector must have the same number of rows.
print(train_set.shape)
print(train_labels.shape)

(707438, 3)
(707438,)


In [30]:
# One-hot encode the integer labels for the softmax output layer.
# total_unique_words is defined in the earlier label-counting cell, which must
# have been run before this one.
# NOTE(review): per the printed shape this materialises a dense (707438, 20568)
# matrix; integer labels with a sparse categorical loss would avoid it — consider.
train_labels = to_categorical(train_labels, num_classes=total_unique_words)
test_labels = to_categorical(test_labels, num_classes=total_unique_words)
print(train_labels.shape)

(707438, 20568)


In [31]:
# Fully connected classifier for the word task:
# 3 input features -> 512 (relu) -> 256 (relu) -> softmax over total_unique_words classes.
model = Sequential()

model.add(Dense(units=512, activation='relu', input_dim=3*1))  # first hidden layer
model.add(Dense(units=256, activation='relu'))  # second hidden layer
# model.add(Dense(units=128, activation='relu'))  # third hidden layer
model.add(Dense(units=total_unique_words, activation='softmax'))  # output layer
# model.add(Dense(units=128))  # output layer

# categorical_crossentropy matches the one-hot encoded labels fed to fit().
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               2048      
_________________________________________________________________
dense_1 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_2 (Dense)              (None, 20568)             5285976   
Total params: 5,419,352
Trainable params: 5,419,352
Non-trainable params: 0
_________________________________________________________________


In [32]:
# Train for 4 epochs with mini-batches of 256 rows; the returned History object is the cell output.
model.fit(train_set, train_labels, epochs=4, batch_size=256)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x7f6d28076080>

In [33]:
# evaluate() yields [loss, accuracy] here because 'accuracy' is the only compiled metric.
print("test loss, test acc:", model.evaluate(test_set, test_labels))

test loss, test acc: [9.740732192993164, 0.14129993319511414]


batch size = 128:  
test loss, test acc: [12.597193717956543, 0.1413850337266922] => 128 epochs  

batch size = 256:  
test loss, test acc: [9.740732192993164, 0.14129993319511414] => 4 epochs

In [34]:
# Disabled experiment: the code is wrapped in a string literal, so it is never executed.
# NOTE(review): test_labels is one-hot encoded at this point (see the
# to_categorical cell), so it would likely need an argmax before being passed
# to classification_report — confirm before re-enabling.
"""
pred_y = model.predict(test_set)
print(len(pred_y))
print(pred_y[0])

pred_y_labels = [0]*len(pred_y)
for i in range(len(pred_y)):
    pred_y_labels[i] = np.argmax(pred_y[i])
    
print(pred_y_labels[0])

print(classification_report(test_labels, pred_y_labels))
"""
print("Not used")

Not used


We can see that changing the epoch count doesn't change the results that much, so we should try to explore different models / architectures.

#### Phonemes

In [35]:
# Phoneme-level task: same features, the `phonemes` column as target.
train_set, train_labels = get_labels(skype_data_train, label=["phonemes"])
test_set, test_labels = get_labels(skype_data_test, label=["phonemes"])

# Integer-encode the labels; the fitted encoder (third return value) is discarded.
train_labels, test_labels, _ = prepare_labels(train_labels, test_labels, label=["phonemes"])

  return f(*args, **kwargs)


In [36]:
# Sanity check: the feature frame and the encoded label vector must have the same number of rows.
print(train_set.shape)
print(train_labels.shape)

(707438, 3)
(707438,)


In [37]:
# One-hot encode the integer phoneme labels for the softmax output layer.
# total_unique_phonemes is defined in the earlier label-counting cell, which
# must have been run before this one.
train_labels = to_categorical(train_labels, num_classes=total_unique_phonemes)
test_labels = to_categorical(test_labels, num_classes=total_unique_phonemes)
print(train_labels.shape)

(707438, 3281)


In [38]:
# Same architecture as the word model, rebuilt for the phoneme task:
# 3 input features -> 512 (relu) -> 256 (relu) -> softmax over total_unique_phonemes classes.
# Rebinding `model` replaces the word-level network.
model = Sequential()

model.add(Dense(units=512, activation='relu', input_dim=3*1))  # first hidden layer
model.add(Dense(units=256, activation='relu'))  # second hidden layer
# model.add(Dense(units=128, activation='relu'))  # third hidden layer
model.add(Dense(units=total_unique_phonemes, activation='softmax'))  # output layer
# model.add(Dense(units=128))  # output layer

# categorical_crossentropy matches the one-hot encoded labels fed to fit().
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 512)               2048      
_________________________________________________________________
dense_4 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_5 (Dense)              (None, 3281)              843217    
Total params: 976,593
Trainable params: 976,593
Non-trainable params: 0
_________________________________________________________________


In [39]:
# Train for 4 epochs with mini-batches of 256 rows; the returned History object is the cell output.
model.fit(train_set, train_labels, epochs=4, batch_size=256)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x7f6d282227b8>

In [40]:
# evaluate() yields [loss, accuracy] here because 'accuracy' is the only compiled metric.
print("test loss, test acc:", model.evaluate(test_set, test_labels))

test loss, test acc: [4.16491174697876, 0.17395879328250885]


batch size = 256:  
test loss, test acc: [4.276370048522949, 0.17660073935985565] => 128 epochs  
test loss, test acc: [4.16491174697876, 0.17395879328250885] => 4 epochs

We can see that changing the epoch count doesn't change the results that much, so we should try to explore different models / architectures.

### LSTM