# Replication of attack on VoIP end-to-end encrypted messengers

## Models

### Loading and preprocessing

We will now try to explore various models on `Whatsapp` dataset. Bellow we will find loading and preprocessing that we have come up with in the analysis section.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

sns.set()  # make plots nicer

np.random.seed(42)  # set seed for reproducibility

In [2]:
def file_parser_with_prev_next(path):
    file = open(path, 'r')
    lines = file.readlines()
    
    file_name = [path.split('/')[-1]]
    sentence = ""
    file_data = []
    
    has_value = False
    previous = 0
    
    for line in lines:
        line = line.strip()
        
        # if there are only 2 informations on line and second is h#, then ignore
        # strip line, split primarly on ; secondary on ,
        if (line.startswith('#')):
            if (not sentence):
                sentence = line[len('# Sentence: "'): len(line) - 1]
            continue
        
        line = line.split(';')
        
        if (len(line) == 1):
            #lines containing only their packet size and nothing else, they should be added
            #TODO
            line += [""]
            line += [""]
            #continue
        
        if (len(line) == 2):
            #this tries to remove most of the silence at the start of the recording
            #potentionally harmfull as we shouldn't clean test data this way (we will be reading labels)
            #if (line[1] == 'h#'):
            #    continue
            line += [""]
        
        line[1] = tuple(line[1].split(','))
        line[2] = tuple(list(map(lambda a: a.strip('"'), line[2].split(','))))
        
        if (has_value):
            file_data[-1][4] = line[0]
           
        # file_type and sentence contain duplicate informations, but are kept for readability
        line = file_name + [file_name[0][0:9]] + [sentence] + [previous] + [0] + line
        #adding previous as feature
        previous = line[5]
        file_data += [line]
        
        #adding next frame as feature
        has_value = True
        
        

    return pd.DataFrame(file_data, columns=['file', 'speaker', 'sentence', 'previous_packet', 'next_packet','packet_size', 'phonemes', 'words'])

def load_files_with_prev_next(directory):
    filelist = os.listdir(directory)
    #read them into pandas
    df_list = [file_parser_with_prev_next(directory+file) for file in filelist]
    #concatenate them together
    return pd.concat(df_list, ignore_index=True)

In [3]:
def convert_types(data_frame):
    data_frame['packet_size'] = pd.to_numeric(data_frame['packet_size'])
    data_frame['previous_packet'] = pd.to_numeric(data_frame['previous_packet'])
    data_frame['next_packet'] = pd.to_numeric(data_frame['next_packet'])

    data_frame['file'] = data_frame['file'].astype('category')
    data_frame['sentence'] = data_frame['sentence'].astype('category')
    data_frame['speaker'] = data_frame['speaker'].astype('category')

In [4]:
whatsapp_data_train = load_files_with_prev_next("./../data/whatsapp_train_data/")
whatsapp_data_test = load_files_with_prev_next("./../data/whatsapp_test_data/")
convert_types(whatsapp_data_train)
convert_types(whatsapp_data_test)
whatsapp_data_test

Unnamed: 0,file,speaker,sentence,previous_packet,next_packet,packet_size,phonemes,words
0,DR1-FAKS0-SA1.CSV,DR1-FAKS0,She had your dark suit in greasy wash water al...,0,342,249,"(h#,)","(,)"
1,DR1-FAKS0-SA1.CSV,DR1-FAKS0,She had your dark suit in greasy wash water al...,249,335,342,"(h#,)","(,)"
2,DR1-FAKS0-SA1.CSV,DR1-FAKS0,She had your dark suit in greasy wash water al...,342,303,335,"(h#,)","(,)"
3,DR1-FAKS0-SA1.CSV,DR1-FAKS0,She had your dark suit in greasy wash water al...,335,364,303,"(h#, sh)","(she,)"
4,DR1-FAKS0-SA1.CSV,DR1-FAKS0,She had your dark suit in greasy wash water al...,303,418,364,"(sh, iy, hv)","(she, had)"
...,...,...,...,...,...,...,...,...
31584,DR8-MSLB0-SX383.CSV,DR8-MSLB0,The carpet cleaners shampooed our oriental rug.,338,370,303,"(r, ao, r)","(our, oriental)"
31585,DR8-MSLB0-SX383.CSV,DR8-MSLB0,The carpet cleaners shampooed our oriental rug.,303,314,370,"(r, iy, eh)","(oriental,)"
31586,DR8-MSLB0-SX383.CSV,DR8-MSLB0,The carpet cleaners shampooed our oriental rug.,370,303,314,"(eh, n, tcl, t)","(oriental,)"
31587,DR8-MSLB0-SX383.CSV,DR8-MSLB0,The carpet cleaners shampooed our oriental rug.,314,295,303,"(t, el, r, ah)","(oriental, rug)"


In [5]:
def add_surrounding(data_frame):
    data_frame['prev_curr'] = list(zip(data_frame.previous_packet, data_frame.packet_size))
    data_frame['next_curr'] = list(zip(data_frame.next_packet, data_frame.packet_size))
    data_frame['packet_surrounding'] = list(zip(data_frame.previous_packet, data_frame.packet_size, data_frame.next_packet))
    
    #data_frame['prev_curr'] = data_frame['prev_curr'].astype('category')
    #data_frame['next_curr'] = data_frame['next_curr'].astype('category')
    #data_frame['packet_surrounding'] = data_frame['packet_surrounding'].astype('category')

add_surrounding(whatsapp_data_train)
add_surrounding(whatsapp_data_test)

whatsapp_data_train = whatsapp_data_train[['file', 'speaker', 'sentence', 'previous_packet', 'next_packet','packet_size', 'prev_curr', 'next_curr', 'packet_surrounding', 'phonemes', 'words']]
whatsapp_data_test = whatsapp_data_test[['file', 'speaker', 'sentence', 'previous_packet', 'next_packet','packet_size', 'prev_curr', 'next_curr', 'packet_surrounding', 'phonemes', 'words']]
whatsapp_data_train

Unnamed: 0,file,speaker,sentence,previous_packet,next_packet,packet_size,prev_curr,next_curr,packet_surrounding,phonemes,words
0,DR1-FCJF0-SA1.CSV,DR1-FCJF0,She had your dark suit in greasy wash water al...,0,380,227,"(0, 227)","(380, 227)","(0, 227, 380)","(h#,)","(,)"
1,DR1-FCJF0-SA1.CSV,DR1-FCJF0,She had your dark suit in greasy wash water al...,227,407,380,"(227, 380)","(407, 380)","(227, 380, 407)","(h#, sh, ix)","(she,)"
2,DR1-FCJF0-SA1.CSV,DR1-FCJF0,She had your dark suit in greasy wash water al...,380,350,407,"(380, 407)","(350, 407)","(380, 407, 350)","(ix, hv, eh)","(she, had)"
3,DR1-FCJF0-SA1.CSV,DR1-FCJF0,She had your dark suit in greasy wash water al...,407,281,350,"(407, 350)","(281, 350)","(407, 350, 281)","(eh, dcl, jh)","(had, your)"
4,DR1-FCJF0-SA1.CSV,DR1-FCJF0,She had your dark suit in greasy wash water al...,350,327,281,"(350, 281)","(327, 281)","(350, 281, 327)","(jh, ih, dcl, d, ah)","(had, your, dark)"
...,...,...,...,...,...,...,...,...,...,...,...
86492,DR8-MTCS0-SX82.CSV,DR8-MTCS0,Good service should be rewarded by big tips.,286,253,268,"(286, 268)","(253, 268)","(286, 268, 253)","(ay, bcl, b, ih)","(by, big)"
86493,DR8-MTCS0-SX82.CSV,DR8-MTCS0,Good service should be rewarded by big tips.,268,315,253,"(268, 253)","(315, 253)","(268, 253, 315)","(ih, gcl)","(big,)"
86494,DR8-MTCS0-SX82.CSV,DR8-MTCS0,Good service should be rewarded by big tips.,253,279,315,"(253, 315)","(279, 315)","(253, 315, 279)","(gcl, t, ih)","(big, tips)"
86495,DR8-MTCS0-SX82.CSV,DR8-MTCS0,Good service should be rewarded by big tips.,315,392,279,"(315, 279)","(392, 279)","(315, 279, 392)","(ih, pcl, p)","(tips,)"


### Preparing data

In [6]:
# add removal of labels for the test_dataset
def get_labels(df, label=["words"], feature=["previous_packet", "packet_size", "next_packet"]):
    labels = df[label]
    features = df[feature]
    return features, labels

In [7]:
from sklearn.preprocessing import LabelEncoder

def prepare_labels(train_labels, test_labels, label=["words"]):
    train_labels = train_labels.astype('category')
    test_labels = test_labels.astype('category')
    
    total_labels = train_labels.append(test_labels)
    
    lab_enc = LabelEncoder()
    lab_enc.fit(total_labels[label])

    train_labels = lab_enc.transform(train_labels[label])
    test_labels = lab_enc.transform(test_labels[label])
    
    return train_labels, test_labels, lab_enc

### Tree classifier

First model that we will be trying is tree classifier. In the analysis we have noticed, that there is almost a 1:1 correspondence of trigram of phoneme sizes and words (eg. that for every trigram of phoneme sizes there is different word). 

add something how it might lead to something
or about that if we have everything from the same user that then we decipher his voice recordings really easily

In [8]:
train_set, train_labels = get_labels(whatsapp_data_train)
test_set, test_labels = get_labels(whatsapp_data_test)

train_labels = train_labels.astype('category')
test_labels = test_labels.astype('category')

In [9]:
total_labels = train_labels.append(test_labels)
print(len(pd.unique(train_labels.words)))
print(len(pd.unique(test_labels.words)))
len(pd.unique(total_labels.words))

16168
6739


21317

Now we can see that we have a really big problem => there are 5149 new words that we have never seen. As we saw in our analysis we can't really generalise on never seen words before.

In [10]:
from sklearn.preprocessing import LabelEncoder

lab_enc = LabelEncoder()
lab_enc.fit(total_labels.words)

train_labels = lab_enc.transform(train_labels.words)
test_labels = lab_enc.transform(test_labels.words)
train_labels

array([    0, 15548, 15578, ...,  2531, 18676, 18676])

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline

tree_clf_pipeline = Pipeline(
    [
        (
            "clf",
            DecisionTreeClassifier(criterion="entropy", max_depth=14, splitter="best",
                                   min_samples_split=2, random_state=42),
        ),
    ]
)
# Words: criterion="entropy", max_depth=None, splitter="best", min_samples_split=2, random_state=42 => 0.97, 0.02
# 

In [12]:
tree_clf_pipeline.fit(train_set, train_labels)

print(f"Train accuracy: {tree_clf_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {tree_clf_pipeline.score(test_set, test_labels):.4f}")

Train accuracy: 0.2631
Test accuracy : 0.0380


Phonemes

In [13]:
train_set, train_labels = get_labels(whatsapp_data_train, label=["phonemes"])
test_set, test_labels = get_labels(whatsapp_data_test, label=["phonemes"])

train_labels = train_labels.astype('category')
test_labels = test_labels.astype('category')

In [14]:
total_labels = train_labels.append(test_labels)
print(len(pd.unique(train_labels.phonemes)))
print(len(pd.unique(test_labels.phonemes)))
len(pd.unique(total_labels.phonemes))

27369
12655


33990

In [15]:
lab_enc = LabelEncoder()
lab_enc.fit(total_labels.phonemes)

train_labels = lab_enc.transform(train_labels.phonemes)
test_labels = lab_enc.transform(test_labels.phonemes)
train_labels

array([12858, 13445, 15553, ..., 12837, 14816, 24059])

In [16]:
tree_clf_pipeline.fit(train_set, train_labels)

print(f"Train accuracy: {tree_clf_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {tree_clf_pipeline.score(test_set, test_labels):.4f}")

Train accuracy: 0.2454
Test accuracy : 0.0293


All and all we can see, that tree classifier was really underperforming. It needed too much RAM and didn't provide any meaningfull results.

### KNN

Let's take a look a different kind of classifier => k nearest neighbours. This classifier shouldn't need that much RAM and that much of a computational power.

In [17]:
train_set, train_labels = get_labels(whatsapp_data_train, label=["words"])
test_set, test_labels = get_labels(whatsapp_data_test, label=["words"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels)

  return f(**kwargs)


In [18]:
from sklearn.neighbors import KNeighborsClassifier

knn_clf_pipeline = Pipeline(
    [
        (
            "clf",
            KNeighborsClassifier(64, weights='distance', n_jobs=4)
        ),
    ]
)

#5   => 0.0313
#10  => 0.0377
#20  => 0.0450
#32  => 0.0497
#64  => 0.0567
#128 => 0.0625
#256 => 0.0668

# uniform gives better test results but doesn't seem to be able to "answer correctly" on the train test
# 64 => 0.0927, 0.0685

aha


In [19]:
knn_clf_pipeline.fit(train_set, train_labels)

print(f"Train accuracy: {knn_clf_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {knn_clf_pipeline.score(test_set, test_labels):.4f}")

Train accuracy: 0.9723
Test accuracy : 0.0567


For the search space of 64 nearest neighbours we get only 5.67% success rate on our test data (which is around 1971 words). I have listed other parameters and their resulting percentages in the comments in the code cell. 

This whole section is just made as sanity check that we actually get expected results (that is we only guess the words we've already seen and none from which we haven't seen).

In [20]:
data_test_copy = whatsapp_data_test.copy()

column_select = list(map(lambda x: x in list(whatsapp_data_train.words.drop_duplicates()), list(data_test_copy.words)))

print("Known words:\t", column_select.count(True))
print("Unknown words:\t", column_select.count(False))

16405
15184


In [21]:
data_test_copy = data_test_copy[column_select]

In [22]:
train_set, train_labels = get_labels(whatsapp_data_train, label=["words"])
test_set, test_labels = get_labels(data_test_copy, label=["words"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels)

  return f(**kwargs)


In [23]:
# knn_clf_pipeline.fit(train_set, train_labels)

# 256, distance => 0.9723, 0.1286 on only 

print(f"Train accuracy: {knn_clf_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {knn_clf_pipeline.score(test_set, test_labels):.4f}")

Train accuracy: 0.0516
Test accuracy : 0.0622


From this we get that the succes rate on known is around double the ammount on all words (this can be seen from the output of a cell 2 cells above).

In [25]:
train_set, train_labels = get_labels(whatsapp_data_train, label=["words"])
test_set, test_labels = get_labels(whatsapp_data_test[list(map(lambda x: not x, column_select))], label=["words"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels)

  return f(**kwargs)


In [26]:
#knn_clf_pipeline.fit(train_set, train_labels)

print(f"Train accuracy: {knn_clf_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {knn_clf_pipeline.score(test_set, test_labels):.4f}")

Train accuracy: 0.9723
Test accuracy : 0.0000


This test was only made as "sanity check" as it is indeed highly probable that our model wouldn't be able to properly guess on never seen examples of words.

Now let's try our luck with phonemes:

In [27]:
train_set, train_labels = get_labels(whatsapp_data_train, label=["phonemes"])
test_set, test_labels = get_labels(whatsapp_data_test, label=["phonemes"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels, label=["phonemes"])

  return f(**kwargs)


In [28]:
knn_clf_pipeline = Pipeline(
    [
        (
            "clf",
            KNeighborsClassifier(64, weights='distance', n_jobs=4)
        ),
    ]
)
# 5 => 0.0256
# 6 => 0.0269
# 10 => 0.0299
# 20 => 0.0343
# 32 => 0.0368
# 64 => 0.0404
# 128 => 0.0441
# 256 => 0.0465

In [29]:
knn_clf_pipeline.fit(train_set, train_labels)

print(f"Train accuracy: {knn_clf_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {knn_clf_pipeline.score(test_set, test_labels):.4f}")

Train accuracy: 0.9723
Test accuracy : 0.0404


We can clearly see that phonemes didn't help us that much and that the results are far worse from those gotten by exploring words.

### SVC

In [None]:
train_set, train_labels = get_labels(whatsapp_data_train, label=["words"])
test_set, test_labels = get_labels(whatsapp_data_test, label=["words"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels)

In [None]:
from sklearn.svm import SVC

svc_pipeline = Pipeline(
    [
        (
            "clf",
            KNeighborsClassifier(64, weights='distance', n_jobs=4)
        ),
    ]
)


In [None]:
svc_pipeline.fit(train_set, train_labels)

print(f"Train accuracy: {svc_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {svc_pipeline.score(test_set, test_labels):.4f}")

### AdaBoost Classifier

In [None]:
train_set, train_labels = get_labels(whatsapp_data_train, label=["words"])
test_set, test_labels = get_labels(whatsapp_data_test, label=["words"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels)

In [None]:
from sklearn.ensemble import AdaBoostClassifier

abc_pipeline = Pipeline(
    [
        (
            "clf",
            KNeighborsClassifier(64, weights='distance', n_jobs=4)
        ),
    ]
)


In [None]:
abc_pipeline.fit(train_set, train_labels)

print(f"Train accuracy: {abc_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {abc_pipeline.score(test_set, test_labels):.4f}")

### MLP Classifier

In [None]:
train_set, train_labels = get_labels(whatsapp_data_train, label=["words"])
test_set, test_labels = get_labels(whatsapp_data_test, label=["words"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels)

In [None]:
from sklearn.neural_network import MLPClassifier

mlp_clf_pipeline = Pipeline(
    [
        (
            "clf",
            KNeighborsClassifier(64, weights='distance', n_jobs=4)
        ),
    ]
)


In [None]:
mlp_clf_pipeline.fit(train_set, train_labels)

print(f"Train accuracy: {mlp_clf_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {mlp_clf_pipeline.score(test_set, test_labels):.4f}")