# Replication of attack on VoIP end-to-end encrypted messengers

## Models

### Loading and preprocessing

We will now try to explore various models on `skype` dataset. Bellow we will find loading and preprocessing that we have come up with in the analysis section.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

from sklearn.model_selection import GridSearchCV

sns.set()  # make plots nicer

np.random.seed(42)  # set seed for reproducibility

In [2]:
def file_parser_with_prev_next(path):
    file = open(path, 'r')
    lines = file.readlines()
    
    file_name = [path.split('/')[-1]]
    sentence = ""
    file_data = []
    
    has_value = False
    previous = 0
    
    for line in lines:
        line = line.strip()
        
        # if there are only 2 informations on line and second is h#, then ignore
        # strip line, split primarly on ; secondary on ,
        if (line.startswith('#')):
            if (not sentence):
                sentence = line[len('# Sentence: "'): len(line) - 1]
            continue
        
        line = line.split(';')
        
        if (len(line) == 1):
            #lines containing only their packet size and nothing else, they should be added
            #TODO
            line += [""]
            line += [""]
            #continue
        
        if (len(line) == 2):
            #this tries to remove most of the silence at the start of the recording
            #potentionally harmfull as we shouldn't clean test data this way (we will be reading labels)
            #if (line[1] == 'h#'):
            #    continue
            line += [""]
        
        line[1] = tuple(line[1].split(','))
        line[2] = tuple(list(map(lambda a: a.strip('"'), line[2].split(','))))
        
        if (has_value):
            file_data[-1][-4] = line[0]
           
        # file_type and sentence contain duplicate informations, but are kept for readability
        split_filename = file_name[0].split('-')
        
        line = file_name + [split_filename[0]] + [split_filename[1]] + [split_filename[2][0:-4]] + [sentence] + [previous] + [0] + line
        #adding previous as feature
        previous = line[-3]
        file_data += [line]
        
        #adding next frame as feature
        has_value = True
        
    return pd.DataFrame(file_data, columns=['file', 'dialect', 'speaker', 'sentence_id', 'sentence', 'previous_packet', 'next_packet','packet_size', 'phonemes', 'words'])

def load_files_with_prev_next(directory):
    filelist = os.listdir(directory)
    #read them into pandas
    df_list = [file_parser_with_prev_next(directory+file) for file in filelist]
    #concatenate them together
    return pd.concat(df_list, ignore_index=True)

def convert_types(data_frame):
    data_frame['packet_size'] = pd.to_numeric(data_frame['packet_size'])
    data_frame['previous_packet'] = pd.to_numeric(data_frame['previous_packet'])
    data_frame['next_packet'] = pd.to_numeric(data_frame['next_packet'])

    data_frame['file'] = data_frame['file'].astype('category')
    data_frame['sentence'] = data_frame['sentence'].astype('category')
    
    data_frame['dialect'] = data_frame['dialect'].astype('category')
    data_frame['speaker'] = data_frame['speaker'].astype('category')
    data_frame['sentence_id'] = data_frame['sentence_id'].astype('category')

In [3]:
skype_data_train = load_files_with_prev_next("./../data/skype_train_data/")
skype_data_test = load_files_with_prev_next("./../data/skype_test_data/")
convert_types(skype_data_train)
convert_types(skype_data_test)
skype_data_test

Unnamed: 0,file,dialect,speaker,sentence_id,sentence,previous_packet,next_packet,packet_size,phonemes,words
0,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,0,35,30,"(h#,)","(,)"
1,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,30,43,35,"(h#,)","(,)"
2,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,35,26,43,"(h#,)","(,)"
3,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,43,30,26,"(h#,)","(,)"
4,DR1-FAKS0-SA1.CSV,DR1,FAKS0,SA1,She had your dark suit in greasy wash water al...,26,31,30,"(h#,)","(,)"
...,...,...,...,...,...,...,...,...,...,...
258516,DR8-MSLB0-SX383.CSV,DR8,MSLB0,SX383,The carpet cleaners shampooed our oriental rug.,40,43,46,"(h#,)","(,)"
258517,DR8-MSLB0-SX383.CSV,DR8,MSLB0,SX383,The carpet cleaners shampooed our oriental rug.,46,41,43,"(h#,)","(,)"
258518,DR8-MSLB0-SX383.CSV,DR8,MSLB0,SX383,The carpet cleaners shampooed our oriental rug.,43,34,41,"(h#,)","(,)"
258519,DR8-MSLB0-SX383.CSV,DR8,MSLB0,SX383,The carpet cleaners shampooed our oriental rug.,41,33,34,"(h#,)","(,)"


In [4]:
def add_surrounding(data_frame):
    data_frame['prev_curr'] = list(zip(data_frame.previous_packet, data_frame.packet_size))
    data_frame['next_curr'] = list(zip(data_frame.next_packet, data_frame.packet_size))
    data_frame['packet_surrounding'] = list(zip(data_frame.previous_packet, data_frame.packet_size, data_frame.next_packet))
    
    #data_frame['prev_curr'] = data_frame['prev_curr'].astype('category')
    #data_frame['next_curr'] = data_frame['next_curr'].astype('category')
    #data_frame['packet_surrounding'] = data_frame['packet_surrounding'].astype('category')

add_surrounding(skype_data_train)
add_surrounding(skype_data_test)

skype_data_train = skype_data_train[['file', 'dialect', 'speaker', 'sentence_id', 'sentence', 'previous_packet', 'next_packet','packet_size', 'prev_curr', 'next_curr', 'packet_surrounding', 'phonemes', 'words']]
skype_data_test = skype_data_test[['file', 'dialect', 'speaker', 'sentence_id', 'sentence', 'previous_packet', 'next_packet','packet_size', 'prev_curr', 'next_curr', 'packet_surrounding', 'phonemes', 'words']]
skype_data_train

Unnamed: 0,file,dialect,speaker,sentence_id,sentence,previous_packet,next_packet,packet_size,prev_curr,next_curr,packet_surrounding,phonemes,words
0,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,0,32,32,"(0, 32)","(32, 32)","(0, 32, 32)","(h#,)","(,)"
1,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,32,31,32,"(32, 32)","(31, 32)","(32, 32, 31)","(h#,)","(,)"
2,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,32,28,31,"(32, 31)","(28, 31)","(32, 31, 28)","(h#,)","(,)"
3,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,31,28,28,"(31, 28)","(28, 28)","(31, 28, 28)","(h#,)","(,)"
4,DR1-FCJF0-SA1.CSV,DR1,FCJF0,SA1,She had your dark suit in greasy wash water al...,28,36,28,"(28, 28)","(36, 28)","(28, 28, 36)","(h#,)","(,)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
707433,DR8-MTCS0-SX82.CSV,DR8,MTCS0,SX82,Good service should be rewarded by big tips.,47,34,32,"(47, 32)","(34, 32)","(47, 32, 34)","(h#,)","(,)"
707434,DR8-MTCS0-SX82.CSV,DR8,MTCS0,SX82,Good service should be rewarded by big tips.,32,39,34,"(32, 34)","(39, 34)","(32, 34, 39)","(h#,)","(,)"
707435,DR8-MTCS0-SX82.CSV,DR8,MTCS0,SX82,Good service should be rewarded by big tips.,34,33,39,"(34, 39)","(33, 39)","(34, 39, 33)","(h#,)","(,)"
707436,DR8-MTCS0-SX82.CSV,DR8,MTCS0,SX82,Good service should be rewarded by big tips.,39,36,33,"(39, 33)","(36, 33)","(39, 33, 36)","(h#,)","(,)"


### Preparing data

In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

In [6]:
# add removal of labels for the test_dataset
def get_labels(df, label=["words"], feature=["previous_packet", "packet_size", "next_packet"]):
    labels = df.loc[:, label]
    features = df.loc[:, feature]
    return features, labels

In [7]:
def prepare_labels(train_labels, test_labels, label=["words"]):
    train_labels = train_labels.astype('category')
    test_labels = test_labels.astype('category')
    
    total_labels = train_labels.append(test_labels)
    
    lab_enc = LabelEncoder()
    lab_enc.fit(total_labels[label])

    train_labels = lab_enc.transform(train_labels[label])
    test_labels = lab_enc.transform(test_labels[label])
    
    return train_labels, test_labels, lab_enc

In [8]:
train_set, train_labels = get_labels(skype_data_train)
test_set, test_labels = get_labels(skype_data_test)

train_labels = train_labels.astype('category')
test_labels = test_labels.astype('category')

total_labels = train_labels.append(test_labels)
print(len(pd.unique(train_labels.words)))
print(len(pd.unique(test_labels.words)))
total_unique_words = len(pd.unique(total_labels.words))
total_unique_words

15713
6626


20568

Now we can see that we have a really big problem => there are 4855 new words that we have never seen. As we saw in our analysis we can't really generalise on never seen words before => this will hinder our results

In [9]:
train_set, train_labels = get_labels(skype_data_train, label=['phonemes'])
test_set, test_labels = get_labels(skype_data_test, label=['phonemes'])

train_labels = train_labels.astype('category')
test_labels = test_labels.astype('category')

total_labels = train_labels.append(test_labels)
print(len(pd.unique(train_labels.phonemes)))
print(len(pd.unique(test_labels.phonemes)))
total_unique_phonemes = len(pd.unique(total_labels.phonemes))
total_unique_phonemes

3083
2363


3281

But this is really promissing. There are only 198 new phonemes, that we haven't seen.

In [10]:
def cv_dialect_splitter():
    for dialect in np.unique(skype_data_train.dialect):
        yield (skype_data_train.index[skype_data_train["dialect"] != dialect],
               skype_data_train.index[skype_data_train["dialect"] == dialect])

### Tree classifier

First model that we will be trying is tree classifier. 

In [11]:
from sklearn.tree import DecisionTreeClassifier

In [12]:
train_set, train_labels = get_labels(skype_data_train, label=["words"])
test_set, test_labels = get_labels(skype_data_test, label=["words"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels)

  return f(*args, **kwargs)


In [20]:
parameters = {'criterion':['gini', 'entropy'], 'max_depth':[12,None], 'splitter':['best'],
              'min_samples_split':[2], 'random_state':[42]}

In [21]:
orig_clf = DecisionTreeClassifier()
gscv_clf = GridSearchCV(orig_clf, parameters, n_jobs = -1, cv=cv_dialect_splitter())

In [22]:
print("Starting!")
gscv_clf.fit(train_set, train_labels)
print("Finished!")

print("Best: %f using %s" % (gscv_clf.best_score_, gscv_clf.best_params_))
print(f"Train accuracy: {gscv_clf.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {gscv_clf.score(test_set, test_labels):.4f}")

Starting!
Finished!
Best: 0.143448 using {'criterion': 'gini', 'max_depth': 12, 'min_samples_split': 2, 'random_state': 42, 'splitter': 'best'}
Train accuracy: 0.1515
Test accuracy : 0.1411


In [23]:
tree_clf_pipeline = Pipeline(
    [
        (
            "clf",
            DecisionTreeClassifier(criterion="entropy", max_depth=None, splitter="best",
                                   min_samples_split=2, random_state=42),
        ),
    ]
)

In [24]:
print("Starting!")
tree_clf_pipeline.fit(train_set, train_labels)
print("Finished!")

print(f"Train accuracy: {tree_clf_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {tree_clf_pipeline.score(test_set, test_labels):.4f}")

Starting!
Finished!
Train accuracy: 0.2925
Test accuracy : 0.1195


Phonemes

In [14]:
train_set, train_labels = get_labels(skype_data_train, label=["phonemes"])
test_set, test_labels = get_labels(skype_data_test, label=["phonemes"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels, label=["phonemes"])

In [24]:
orig_clf = DecisionTreeClassifier()
gscv_clf = GridSearchCV(orig_clf, parameters, n_jobs = -1, cv=cv_dialect_splitter())

In [25]:
print("Starting!")
gscv_clf.fit(train_set, train_labels)
print("Finished!")

print("Best: %f using %s" % (gscv_clf.best_score_, gscv_clf.best_params_))
print(f"Train accuracy: {gscv_clf.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {gscv_clf.score(test_set, test_labels):.4f}")

Starting!
Finished!
Best: 0.143448 using {'criterion': 'gini', 'max_depth': 12, 'min_samples_split': 2, 'random_state': 42, 'splitter': 'best'}
Train accuracy: 0.1515
Test accuracy : 0.1411


In [25]:
tree_clf_pipeline = Pipeline(
    [
        (
            "clf",
            DecisionTreeClassifier(criterion="entropy", max_depth=None, splitter="best",
                                   min_samples_split=2, random_state=42),
        ),
    ]
)

In [26]:
print("Starting!")
tree_clf_pipeline.fit(train_set, train_labels)
print("Finished!")

print(f"Train accuracy: {tree_clf_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {tree_clf_pipeline.score(test_set, test_labels):.4f}")

Starting!
Finished!
Train accuracy: 0.2925
Test accuracy : 0.1195


This is already quite a promissing success rate for "just a simple" tree classifier. Also as we can see, `phonemes` give better results in skype dataset.

### KNN

Let's take a look a different kind of classifier => k nearest neighbours. This classifier shouldn't need that much RAM and that much of a computational power.

In [27]:
from sklearn.neighbors import KNeighborsClassifier

In [28]:
train_set, train_labels = get_labels(skype_data_train, label=["words"])
test_set, test_labels = get_labels(skype_data_test, label=["words"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels)

  return f(*args, **kwargs)


In [32]:
parameters = {'n_neighbors':[16,32], 'weights':['uniform', 'distance'], 'n_jobs':[-1]}

In [33]:
orig_clf = KNeighborsClassifier()
gscv_clf = GridSearchCV(orig_clf, parameters, n_jobs = -1, cv=cv_dialect_splitter())

In [34]:
print("Starting!")
gscv_clf.fit(train_set, train_labels)
print("Finished!")

print("Best: %f using %s" % (gscv_clf.best_score_, gscv_clf.best_params_))
print(f"Train accuracy: {gscv_clf.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {gscv_clf.score(test_set, test_labels):.4f}")

Starting!
Finished!
Best: 0.136635 using {'n_jobs': -1, 'n_neighbors': 32, 'weights': 'uniform'}
Train accuracy: 0.1700
Test accuracy : 0.1343


In [42]:
knn_clf_pipeline = Pipeline(
    [
        (
            "scaler",
            StandardScaler()
        ),
        (
            "clf",
            KNeighborsClassifier(32, weights='distance', n_jobs=-1)
        ),
    ]
)

# 20, distance => 0.2887, 0.1203
# 32, uniform => 0.1700, 0.1343
# 32, distance => 0.2912, 0.1216

In [43]:
print("Starting!")
knn_clf_pipeline.fit(train_set, train_labels)
print("Finished!")

print(f"Train accuracy: {knn_clf_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {knn_clf_pipeline.score(test_set, test_labels):.4f}")

Starting!
Finished!
Train accuracy: 0.2911
Test accuracy : 0.1213


For the search space of 32 nearest neighbours we get around 12% success rate on our test data (which is around 31436 words). I have listed other parameters and their resulting percentages in the comments in the code cell. Also worth noting is that "StandardScaler" only worsens (not tested on skype) our predictions.

Now let's try our luck with phonemes:

In [24]:
train_set, train_labels = get_labels(skype_data_train, label=["phonemes"])
test_set, test_labels = get_labels(skype_data_test, label=["phonemes"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels, label=["phonemes"])

  return f(*args, **kwargs)


In [32]:
parameters = {'n_neighbors':[16,32], 'weights':['uniform', 'distance'], 'n_jobs':[-1]}

In [33]:
orig_clf = KNeighborsClassifier()
gscv_clf = GridSearchCV(orig_clf, parameters, n_jobs = -1, cv=cv_dialect_splitter())

In [34]:
print("Starting!")
gscv_clf.fit(train_set, train_labels)
print("Finished!")

print("Best: %f using %s" % (gscv_clf.best_score_, gscv_clf.best_params_))
print(f"Train accuracy: {gscv_clf.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {gscv_clf.score(test_set, test_labels):.4f}")

Starting!
Finished!
Best: 0.136635 using {'n_jobs': -1, 'n_neighbors': 32, 'weights': 'uniform'}
Train accuracy: 0.1700
Test accuracy : 0.1343


In [25]:
knn_clf_pipeline = Pipeline(
    [
        (
            "clf",
            KNeighborsClassifier(32, weights='distance', n_jobs=-1)
        ),
    ]
)
# 20, distance => 0.3221, 0.1377
# 32, uniform => 0.2093, 0.1574
# 32, distance => 0.3265, 0.1410

In [26]:
print("Starting!")
knn_clf_pipeline.fit(train_set, train_labels)
print("Finished!")

print(f"Train accuracy: {knn_clf_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {knn_clf_pipeline.score(test_set, test_labels):.4f}")

Starting!
Finished!
Train accuracy: 0.3265
Test accuracy : 0.1410


We can clearly see that phonemes are indeed actually better than just words and help us get better predictions. But of course there is also adds the complication of how to make words from these phonemes / make something, that makes sense.

### Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [27]:
train_set, train_labels = get_labels(skype_data_train, label=["phonemes"])
test_set, test_labels = get_labels(skype_data_test, label=["phonemes"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels, label=["phonemes"])

  return f(*args, **kwargs)


In [28]:
rfc_pipeline = Pipeline(
    [
        (
            "clf",
            RandomForestClassifier(max_depth=12, random_state=42, criterion = 'entropy', n_jobs = -1, min_samples_split = 2)
        ),
    ]
)

In [None]:
print("Starting!")
rfc_pipeline.fit(train_set, train_labels)
print("Finished!")

print(f"Train accuracy: {rfc_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {rfc_pipeline.score(test_set, test_labels):.4f}")

RAM :(

### AdaBoost Classifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [9]:
train_set, train_labels = get_labels(skype_data_train, label=["words"])
test_set, test_labels = get_labels(skype_data_test, label=["words"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels)

  return f(*args, **kwargs)


In [10]:
abc_pipeline = Pipeline(
    [
        (
            "clf",
            AdaBoostClassifier(random_state=1, n_estimators = 60, learning_rate=0.9)
        ),
    ]
)


In [1]:
print("Starting!")
#abc_pipeline.fit(train_set, train_labels)
print("Finished!")

#print(f"Train accuracy: {abc_pipeline.score(train_set, train_labels):.4f}")
#print(f"Test accuracy : {abc_pipeline.score(test_set, test_labels):.4f}")

print("0.0505, 0.0471")

0.0505, 0.0471


This classifier ended in absolute failure as it wasn't able to get even acceptable results on the train data. And it even took 8 hours to learn (this is because it can only use 1 thread), so this classifier is pretty much worthless to us.

### MLP Classifier

Now let's now try to bring out the big guns - neural networks. For this I've chosen to use TensorFlow and Keras (PyTorch could also be used). We are able to get reasonably better results but at the cost of long compute times.

In [11]:
import keras
import tensorflow as tf

from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import classification_report

from keras.wrappers.scikit_learn import KerasClassifier

In [12]:
train_set, train_labels = get_labels(skype_data_train, label=["words"])
test_set, test_labels = get_labels(skype_data_test, label=["words"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels)

  return f(*args, **kwargs)


In [13]:
print(train_set.shape)
print(train_labels.shape)

(707438, 3)
(707438,)


In [14]:
train_labels = to_categorical(train_labels, num_classes=total_unique_words)
test_labels = to_categorical(test_labels, num_classes=total_unique_words)
print(train_labels.shape)

(707438, 20568)


In [15]:
from keras.wrappers.scikit_learn import KerasClassifier

def build_clf(optimizer='adam', classes=total_unique_words):
    model = Sequential()

    model.add(Dense(units=512, activation='relu', input_dim=3*1))  # first hidden layer
    model.add(Dense(units=256, activation='relu'))  # second hidden layer
    # model.add(Dense(units=128, activation='relu'))  # third hidden layer
    model.add(Dense(units=classes, activation='softmax'))  # output layer
    # model.add(Dense(units=128))  # output layer

    model.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])
    
    return model

In [16]:
parameters = {'batch_size':[64,128,256],
          'nb_epoch':[16,32],
          'optimizer':['adam', 'rmsprop']}

In [17]:
orig_clf = KerasClassifier(build_fn=build_clf)
gscv_clf = GridSearchCV(orig_clf, parameters, cv=cv_dialect_splitter())

In [None]:
print("Starting!")
gscv_clf.fit(train_set, train_labels)
print("Finished!")

print("Best: %f using %s" % (gscv_clf.best_score_, gscv_clf.best_params_))
print(f"Train accuracy: {gscv_clf.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {gscv_clf.score(test_set, test_labels):.4f}")

Starting!


In [15]:
model = Sequential()

model.add(Dense(units=512, activation='relu', input_dim=3*1))  # first hidden layer
model.add(Dense(units=256, activation='relu'))  # second hidden layer
# model.add(Dense(units=128, activation='relu'))  # third hidden layer
model.add(Dense(units=total_unique_words, activation='softmax'))  # output layer
# model.add(Dense(units=128))  # output layer

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               2048      
_________________________________________________________________
dense_1 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_2 (Dense)              (None, 20568)             5285976   
Total params: 5,419,352
Trainable params: 5,419,352
Non-trainable params: 0
_________________________________________________________________


In [16]:
model.fit(train_set, train_labels, epochs=64, batch_size=256)

Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64
Epoch 14/64
Epoch 15/64
Epoch 16/64
Epoch 17/64
Epoch 18/64
Epoch 19/64
Epoch 20/64
Epoch 21/64
Epoch 22/64
Epoch 23/64
Epoch 24/64
Epoch 25/64
Epoch 26/64
Epoch 27/64
Epoch 28/64
Epoch 29/64
Epoch 30/64
Epoch 31/64
Epoch 32/64
Epoch 33/64
Epoch 34/64
Epoch 35/64
Epoch 36/64
Epoch 37/64
Epoch 38/64
Epoch 39/64
Epoch 40/64
Epoch 41/64
Epoch 42/64
Epoch 43/64
Epoch 44/64
Epoch 45/64
Epoch 46/64
Epoch 47/64
Epoch 48/64
Epoch 49/64
Epoch 50/64
Epoch 51/64
Epoch 52/64
Epoch 53/64
Epoch 54/64
Epoch 55/64
Epoch 56/64
Epoch 57/64
Epoch 58/64
Epoch 59/64
Epoch 60/64
Epoch 61/64
Epoch 62/64
Epoch 63/64
Epoch 64/64


<tensorflow.python.keras.callbacks.History at 0x7f0f97b186a0>

In [17]:
print("train loss, train acc:", model.evaluate(train_set, train_labels))

train loss, train acc: [6.281101703643799, 0.1451999992132187]


In [18]:
print("test loss, test acc:", model.evaluate(test_set, test_labels))

test loss, test acc: [10.701821327209473, 0.14304447174072266]


batch size = 128:  
test loss, test acc: [12.597193717956543, 0.1413850337266922] => 128 epochs  

batch size = 256:  
test loss, test acc: [9.740732192993164, 0.14129993319511414] => 4 epochs

We can see that changing epoch count doesn't change the results that much and we should try to explore different models / architectures.

#### Phonemes

In [19]:
train_set, train_labels = get_labels(skype_data_train, label=["phonemes"])
test_set, test_labels = get_labels(skype_data_test, label=["phonemes"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels, label=["phonemes"])

  return f(*args, **kwargs)


In [20]:
print(train_set.shape)
print(train_labels.shape)

(707438, 3)
(707438,)


In [21]:
train_labels = to_categorical(train_labels, num_classes=total_unique_phonemes)
test_labels = to_categorical(test_labels, num_classes=total_unique_phonemes)
print(train_labels.shape)

(707438, 3281)


In [15]:
from keras.wrappers.scikit_learn import KerasClassifier

def build_clf(optimizer='adam', classes=total_unique_phonemes):
    model = Sequential()

    model.add(Dense(units=512, activation='relu', input_dim=3*1))  # first hidden layer
    model.add(Dense(units=256, activation='relu'))  # second hidden layer
    # model.add(Dense(units=128, activation='relu'))  # third hidden layer
    model.add(Dense(units=classes, activation='softmax'))  # output layer
    # model.add(Dense(units=128))  # output layer

    model.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])
    
    return model

In [16]:
parameters = {'batch_size':[64,128,256],
          'nb_epoch':[16,32],
          'optimizer':['adam', 'rmsprop']}

In [20]:
orig_clf = KerasClassifier(build_fn=build_clf)
gscv_clf = GridSearchCV(orig_clf, parameters, cv=cv_dialect_splitter())

In [22]:
model = Sequential()

model.add(Dense(units=512, activation='relu', input_dim=3*1))  # first hidden layer
model.add(Dense(units=256, activation='relu'))  # second hidden layer
# model.add(Dense(units=128, activation='relu'))  # third hidden layer
model.add(Dense(units=total_unique_phonemes, activation='softmax'))  # output layer
# model.add(Dense(units=128))  # output layer

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 512)               2048      
_________________________________________________________________
dense_4 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_5 (Dense)              (None, 3281)              843217    
Total params: 976,593
Trainable params: 976,593
Non-trainable params: 0
_________________________________________________________________


In [23]:
model.fit(train_set, train_labels, epochs=64, batch_size=256)

Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64
Epoch 14/64
Epoch 15/64
Epoch 16/64
Epoch 17/64
Epoch 18/64
Epoch 19/64
Epoch 20/64
Epoch 21/64
Epoch 22/64
Epoch 23/64
Epoch 24/64
Epoch 25/64
Epoch 26/64
Epoch 27/64
Epoch 28/64
Epoch 29/64
Epoch 30/64
Epoch 31/64
Epoch 32/64
Epoch 33/64
Epoch 34/64
Epoch 35/64
Epoch 36/64
Epoch 37/64
Epoch 38/64
Epoch 39/64
Epoch 40/64
Epoch 41/64
Epoch 42/64
Epoch 43/64
Epoch 44/64
Epoch 45/64
Epoch 46/64
Epoch 47/64
Epoch 48/64
Epoch 49/64
Epoch 50/64
Epoch 51/64
Epoch 52/64
Epoch 53/64
Epoch 54/64
Epoch 55/64
Epoch 56/64
Epoch 57/64
Epoch 58/64
Epoch 59/64
Epoch 60/64
Epoch 61/64
Epoch 62/64
Epoch 63/64
Epoch 64/64


<tensorflow.python.keras.callbacks.History at 0x7f0f97c75fd0>

In [24]:
print("train loss, train acc:", model.evaluate(train_set, train_labels))

train loss, train acc: [4.020120620727539, 0.18351855874061584]


In [25]:
print("test loss, test acc:", model.evaluate(test_set, test_labels))

test loss, test acc: [4.224584579467773, 0.17687150835990906]


batch size = 256:  
test loss, test acc: [4.276370048522949, 0.17660073935985565] => 128 epochs  
test loss, test acc: [4.16491174697876, 0.17395879328250885] => 4 epochs

We can see that changing epoch count doesn't change the results that much and we should try to explore different models / architectures.

### LSTM

In [12]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

In [13]:
# model_lstm = Sequential()
# model_lstm.add(LSTM(256, input_shape = (1, 3)))
# model_lstm.add(Dense(units=total_unique_words))
# model_lstm.compile(loss='categorical_crossentropy',
#               optimizer='adam',
#               metrics=['accuracy']
#              )

# model.summary()

In [14]:
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers import SpatialDropout1D

In [16]:
train_set, train_labels = get_labels(skype_data_train, label=["words"])
test_set, test_labels = get_labels(skype_data_test, label=["words"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels)

  return f(*args, **kwargs)


In [17]:
from keras.utils import to_categorical

train_labels = to_categorical(train_labels, num_classes=total_unique_words)
test_labels = to_categorical(test_labels, num_classes=total_unique_words)
print(train_labels.shape)

(707438, 20568)


In [18]:
reshaped_values = train_set.values.reshape(-1, 1, 3)
reshaped_values[0][0]

array([ 0, 32, 32])

In [15]:
from keras.wrappers.scikit_learn import KerasClassifier

def build_lstm_clf(optimizer='adam', classes=total_unique_words):
    model = Sequential()

    model_lstm.add(LSTM(256, input_shape = (1, 3), dropout = 0.3, recurrent_dropout = 0.3))
    model_lstm.add(Dense(256, activation = 'relu'))
    model_lstm.add(Dropout(0.3))
    model_lstm.add(Dense(classes, activation = 'softmax'))

    model.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])
    
    return model

In [16]:
parameters = {'batch_size':[64,128,256],
          'nb_epoch':[16,32],
          'optimizer':['adam', 'rmsprop']}

In [20]:
orig_clf = KerasClassifier(build_fn=build_clf)
gscv_clf = GridSearchCV(orig_clf, parameters, cv=cv_dialect_splitter())

In [21]:
print("Starting!")
gscv_clf.fit(reshaped_values, train_labels)
print("Finished!")

print("Best: %f using %s" % (gscv_clf.best_score_, gscv_clf.best_params_))
print(f"Train accuracy: {gscv_clf.score(reshaped_values, train_labels):.4f}")
print(f"Test accuracy : {gscv_clf.score(test_set.values.reshape(-1, 1, 3), test_labels):.4f}")

Starting!
Finished!
Best: 0.051422 using {'batch_size': 64, 'nb_epoch': 1, 'optimizer': 'adam'}
Train accuracy: 0.0506
Test accuracy : 0.0465


In [15]:
#more elaborate model
model_lstm = Sequential()

#model_lstm.add(Embedding(input_dim = 3, output_dim = 2, input_length = 86497))
#model_lstm.add(SpatialDropout1D(0.3))
model_lstm.add(LSTM(256, input_shape = (1, 3), dropout = 0.3, recurrent_dropout = 0.3))
model_lstm.add(Dense(256, activation = 'relu'))
model_lstm.add(Dropout(0.3))
model_lstm.add(Dense(total_unique_words, activation = 'softmax'))

model_lstm.compile(
    loss='categorical_crossentropy',
    optimizer='Adam',
    metrics=['accuracy']
)

model_lstm.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 256)               266240    
_________________________________________________________________
dense (Dense)                (None, 256)               65792     
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 20568)             5285976   
Total params: 5,618,008
Trainable params: 5,618,008
Non-trainable params: 0
_________________________________________________________________


In [19]:
model_lstm.fit(reshaped_values, train_labels, epochs=64, batch_size=256)

Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64
Epoch 14/64
Epoch 15/64
Epoch 16/64
Epoch 17/64
Epoch 18/64
Epoch 19/64
Epoch 20/64
Epoch 21/64
Epoch 22/64
Epoch 23/64
Epoch 24/64
Epoch 25/64
Epoch 26/64
Epoch 27/64
Epoch 28/64
Epoch 29/64
Epoch 30/64
Epoch 31/64
Epoch 32/64
Epoch 33/64
Epoch 34/64
Epoch 35/64
Epoch 36/64
Epoch 37/64
Epoch 38/64
Epoch 39/64
Epoch 40/64
Epoch 41/64
Epoch 42/64
Epoch 43/64
Epoch 44/64
Epoch 45/64
Epoch 46/64
Epoch 47/64
Epoch 48/64
Epoch 49/64
Epoch 50/64
Epoch 51/64
Epoch 52/64
Epoch 53/64
Epoch 54/64
Epoch 55/64
Epoch 56/64
Epoch 57/64
Epoch 58/64
Epoch 59/64
Epoch 60/64
Epoch 61/64
Epoch 62/64
Epoch 63/64
Epoch 64/64


<tensorflow.python.keras.callbacks.History at 0x7fed7f1cf7b8>

In [20]:
print("train loss, train acc:", model_lstm.evaluate(reshaped_values, train_labels))

train loss, train acc: [8.382338523864746, 0.13532210886478424]


In [21]:
print("test loss, test acc:", model_lstm.evaluate(test_set.values.reshape(-1, 1, 3), test_labels))

test loss, test acc: [11.390458106994629, 0.13362550735473633]


#### Phonemes:

In [22]:
from keras.utils import to_categorical

train_set, train_labels = get_labels(skype_data_train, label=['phonemes'])
test_set, test_labels = get_labels(skype_data_test, label=['phonemes'])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels, label=['phonemes'])

train_labels = to_categorical(train_labels, num_classes=total_unique_phonemes)
test_labels = to_categorical(test_labels, num_classes=total_unique_phonemes)
print(train_labels.shape)

(707438, 3281)


In [23]:
reshaped_values = train_set.values.reshape(-1, 1, 3)
reshaped_values[0][0]

array([ 0, 32, 32])

In [15]:
from keras.wrappers.scikit_learn import KerasClassifier

def build_lstm_clf(optimizer='adam', classes=total_unique_phonemes):
    model = Sequential()

    model_lstm.add(LSTM(256, input_shape = (1, 3), dropout = 0.3, recurrent_dropout = 0.3))
    model_lstm.add(Dense(256, activation = 'relu'))
    model_lstm.add(Dropout(0.3))
    model_lstm.add(Dense(classes, activation = 'softmax'))

    model.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])
    
    return model

In [16]:
parameters = {'batch_size':[64,128,256],
          'nb_epoch':[16,32],
          'optimizer':['adam', 'rmsprop']}

In [20]:
orig_clf = KerasClassifier(build_fn=build_clf)
gscv_clf = GridSearchCV(orig_clf, parameters, cv=cv_dialect_splitter())

In [21]:
print("Starting!")
gscv_clf.fit(reshaped_values, train_labels)
print("Finished!")

print("Best: %f using %s" % (gscv_clf.best_score_, gscv_clf.best_params_))
print(f"Train accuracy: {gscv_clf.score(reshaped_values, train_labels):.4f}")
print(f"Test accuracy : {gscv_clf.score(test_set.values.reshape(-1, 1, 3), test_labels):.4f}")

Starting!
Finished!
Best: 0.051422 using {'batch_size': 64, 'nb_epoch': 1, 'optimizer': 'adam'}
Train accuracy: 0.0506
Test accuracy : 0.0465


In [21]:
#more elaborate model
model_lstm = Sequential()

#model_lstm.add(Embedding(input_dim = 3, output_dim = 2, input_length = 86497))
#model_lstm.add(SpatialDropout1D(0.3))
model_lstm.add(LSTM(256, input_shape = (1, 3), dropout = 0.3, recurrent_dropout = 0.3))
model_lstm.add(Dense(256, activation = 'relu'))
model_lstm.add(Dropout(0.3))
model_lstm.add(Dense(total_unique_phonemes, activation = 'softmax'))

model_lstm.compile(
    loss='categorical_crossentropy',
    optimizer='Adam',
    metrics=['accuracy']
)

model_lstm.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 256)               266240    
_________________________________________________________________
dense_4 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 3281)              843217    
Total params: 1,175,249
Trainable params: 1,175,249
Non-trainable params: 0
_________________________________________________________________


In [24]:
model_lstm.fit(reshaped_values, train_labels, epochs=64, batch_size=256)

Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64
Epoch 14/64
Epoch 15/64
Epoch 16/64
Epoch 17/64
Epoch 18/64
Epoch 19/64
Epoch 20/64
Epoch 21/64
Epoch 22/64
Epoch 23/64
Epoch 24/64
Epoch 25/64
Epoch 26/64
Epoch 27/64
Epoch 28/64
Epoch 29/64
Epoch 30/64
Epoch 31/64
Epoch 32/64
Epoch 33/64
Epoch 34/64
Epoch 35/64
Epoch 36/64
Epoch 37/64
Epoch 38/64
Epoch 39/64
Epoch 40/64
Epoch 41/64
Epoch 42/64
Epoch 43/64
Epoch 44/64
Epoch 45/64
Epoch 46/64
Epoch 47/64
Epoch 48/64
Epoch 49/64
Epoch 50/64
Epoch 51/64
Epoch 52/64
Epoch 53/64
Epoch 54/64
Epoch 55/64
Epoch 56/64
Epoch 57/64
Epoch 58/64
Epoch 59/64
Epoch 60/64
Epoch 61/64
Epoch 62/64
Epoch 63/64
Epoch 64/64


<tensorflow.python.keras.callbacks.History at 0x7f58b7e112b0>

In [25]:
print("test loss, test acc:", model_lstm.evaluate(test_set.values.reshape(-1, 1, 3), test_labels))

test loss, test acc: [7.068105220794678, 0.118659608066082]


### Exploring the 2 most spoken sentences

In [11]:
sentence_1 = "She had your dark suit in greasy wash water all year."
sentence_2 = "Don't ask me to carry an oily rag like that."

In [12]:
two_sentence_train = skype_data_train.loc[skype_data_train["sentence"].isin([sentence_1, sentence_2])]
two_sentence_train.reset_index(inplace=True)
two_sentence_train

Unnamed: 0,index,file,speaker,sentence,previous_packet,next_packet,packet_size,prev_curr,next_curr,packet_surrounding,phonemes,words
0,0,DR1-FCJF0-SA1.CSV,DR1-FCJF0,She had your dark suit in greasy wash water al...,0,32,32,"(0, 32)","(32, 32)","(0, 32, 32)","(h#,)","(,)"
1,1,DR1-FCJF0-SA1.CSV,DR1-FCJF0,She had your dark suit in greasy wash water al...,32,31,32,"(32, 32)","(31, 32)","(32, 32, 31)","(h#,)","(,)"
2,2,DR1-FCJF0-SA1.CSV,DR1-FCJF0,She had your dark suit in greasy wash water al...,32,28,31,"(32, 31)","(28, 31)","(32, 31, 28)","(h#,)","(,)"
3,3,DR1-FCJF0-SA1.CSV,DR1-FCJF0,She had your dark suit in greasy wash water al...,31,28,28,"(31, 28)","(28, 28)","(31, 28, 28)","(h#,)","(,)"
4,4,DR1-FCJF0-SA1.CSV,DR1-FCJF0,She had your dark suit in greasy wash water al...,28,36,28,"(28, 28)","(36, 28)","(28, 28, 36)","(h#,)","(,)"
...,...,...,...,...,...,...,...,...,...,...,...,...
142973,706238,DR8-MTCS0-SA2.CSV,DR8-MTCS0,Don't ask me to carry an oily rag like that.,30,47,27,"(30, 27)","(47, 27)","(30, 27, 47)","(tcl,)","(that,)"
142974,706239,DR8-MTCS0-SA2.CSV,DR8-MTCS0,Don't ask me to carry an oily rag like that.,27,49,47,"(27, 47)","(49, 47)","(27, 47, 49)","(tcl, h#)","(that,)"
142975,706240,DR8-MTCS0-SA2.CSV,DR8-MTCS0,Don't ask me to carry an oily rag like that.,47,40,49,"(47, 49)","(40, 49)","(47, 49, 40)","(h#,)","(,)"
142976,706241,DR8-MTCS0-SA2.CSV,DR8-MTCS0,Don't ask me to carry an oily rag like that.,49,50,40,"(49, 40)","(50, 40)","(49, 40, 50)","(h#,)","(,)"


In [13]:
two_sentence_test = skype_data_test.loc[skype_data_test["sentence"].isin([sentence_1, sentence_2])]
two_sentence_test.reset_index(inplace=True, drop=True)
two_sentence_test

Unnamed: 0,file,speaker,sentence,previous_packet,next_packet,packet_size,prev_curr,next_curr,packet_surrounding,phonemes,words
0,DR1-FAKS0-SA1.CSV,DR1-FAKS0,She had your dark suit in greasy wash water al...,0,35,30,"(0, 30)","(35, 30)","(0, 30, 35)","(h#,)","(,)"
1,DR1-FAKS0-SA1.CSV,DR1-FAKS0,She had your dark suit in greasy wash water al...,30,43,35,"(30, 35)","(43, 35)","(30, 35, 43)","(h#,)","(,)"
2,DR1-FAKS0-SA1.CSV,DR1-FAKS0,She had your dark suit in greasy wash water al...,35,26,43,"(35, 43)","(26, 43)","(35, 43, 26)","(h#,)","(,)"
3,DR1-FAKS0-SA1.CSV,DR1-FAKS0,She had your dark suit in greasy wash water al...,43,30,26,"(43, 26)","(30, 26)","(43, 26, 30)","(h#,)","(,)"
4,DR1-FAKS0-SA1.CSV,DR1-FAKS0,She had your dark suit in greasy wash water al...,26,31,30,"(26, 30)","(31, 30)","(26, 30, 31)","(h#,)","(,)"
...,...,...,...,...,...,...,...,...,...,...,...
52356,DR8-MSLB0-SA2.CSV,DR8-MSLB0,Don't ask me to carry an oily rag like that.,47,36,32,"(47, 32)","(36, 32)","(47, 32, 36)","(tcl,)","(that,)"
52357,DR8-MSLB0-SA2.CSV,DR8-MSLB0,Don't ask me to carry an oily rag like that.,32,27,36,"(32, 36)","(27, 36)","(32, 36, 27)","(tcl,)","(that,)"
52358,DR8-MSLB0-SA2.CSV,DR8-MSLB0,Don't ask me to carry an oily rag like that.,36,26,27,"(36, 27)","(26, 27)","(36, 27, 26)","(tcl, h#)","(that,)"
52359,DR8-MSLB0-SA2.CSV,DR8-MSLB0,Don't ask me to carry an oily rag like that.,27,24,26,"(27, 26)","(24, 26)","(27, 26, 24)","(h#,)","(,)"


In [14]:
import keras
import tensorflow as tf

from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import classification_report

In [15]:
train_set, train_labels = get_labels(two_sentence_train, label=["words"])
test_set, test_labels = get_labels(two_sentence_test, label=["words"])

In [16]:
total_labels_2 = train_labels.append(test_labels)
print(len(pd.unique(train_labels.words)))
print(len(pd.unique(test_labels.words)))
total_unique_words_2 = len(pd.unique(total_labels_2.words))
total_unique_words_2

42
41


42

In [17]:
train_labels, test_labels, _ = prepare_labels(train_labels, test_labels)

  return f(*args, **kwargs)


In [18]:
train_labels = to_categorical(train_labels, num_classes=total_unique_words_2)
test_labels = to_categorical(test_labels, num_classes=total_unique_words_2)
print(train_labels.shape)

(142978, 42)


#### BE CAREFUL ABOUT TOTAL WORDS

In [15]:
from keras.wrappers.scikit_learn import KerasClassifier

def build_clf(optimizer='adam', classes=total_unique_words_2):
    model = Sequential()

    model.add(Dense(units=512, activation='relu', input_dim=3*1))  # first hidden layer
    model.add(Dense(units=256, activation='relu'))  # second hidden layer
    # model.add(Dense(units=128, activation='relu'))  # third hidden layer
    model.add(Dense(units=classes, activation='softmax'))  # output layer
    # model.add(Dense(units=128))  # output layer

    model.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])
    
    return model

In [16]:
parameters = {'batch_size':[64,128,256],
          'nb_epoch':[16,32],
          'optimizer':['adam', 'rmsprop']}

In [20]:
orig_clf = KerasClassifier(build_fn=build_clf)
gscv_clf = GridSearchCV(orig_clf, parameters, cv=cv_dialect_splitter())

In [19]:
model = Sequential()

model.add(Dense(units=512, activation='relu', input_dim=3*1))  # first hidden layer
model.add(Dense(units=256, activation='relu'))  # second hidden layer
# model.add(Dense(units=128, activation='relu'))  # third hidden layer
model.add(Dense(units=total_unique_words_2, activation='softmax'))  # output layer
# model.add(Dense(units=128))  # output layer

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               2048      
_________________________________________________________________
dense_1 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_2 (Dense)              (None, 42)                10794     
Total params: 144,170
Trainable params: 144,170
Non-trainable params: 0
_________________________________________________________________


In [20]:
model.fit(train_set, train_labels, epochs=64, batch_size=256)

Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64
Epoch 14/64
Epoch 15/64
Epoch 16/64
Epoch 17/64
Epoch 18/64
Epoch 19/64
Epoch 20/64
Epoch 21/64
Epoch 22/64
Epoch 23/64
Epoch 24/64
Epoch 25/64
Epoch 26/64
Epoch 27/64
Epoch 28/64
Epoch 29/64
Epoch 30/64
Epoch 31/64
Epoch 32/64
Epoch 33/64
Epoch 34/64
Epoch 35/64
Epoch 36/64
Epoch 37/64
Epoch 38/64
Epoch 39/64
Epoch 40/64
Epoch 41/64
Epoch 42/64
Epoch 43/64
Epoch 44/64
Epoch 45/64
Epoch 46/64
Epoch 47/64
Epoch 48/64
Epoch 49/64
Epoch 50/64
Epoch 51/64
Epoch 52/64
Epoch 53/64
Epoch 54/64
Epoch 55/64
Epoch 56/64
Epoch 57/64
Epoch 58/64
Epoch 59/64
Epoch 60/64
Epoch 61/64
Epoch 62/64
Epoch 63/64
Epoch 64/64


<tensorflow.python.keras.callbacks.History at 0x7faf92699fd0>

In [21]:
print("test loss, test acc:", model.evaluate(test_set, test_labels))

test loss, test acc: [2.7361292839050293, 0.21008002758026123]


#### Phonemes

In [22]:
train_set, train_labels = get_labels(two_sentence_train, label=["phonemes"])
test_set, test_labels = get_labels(two_sentence_test, label=["phonemes"])

In [23]:
total_labels_2 = train_labels.append(test_labels)
print(len(pd.unique(train_labels.phonemes)))
print(len(pd.unique(test_labels.phonemes)))
total_unique_phonemes_2 = len(pd.unique(total_labels_2.phonemes))
total_unique_phonemes_2

587
420


632

In [24]:
train_labels, test_labels, _ = prepare_labels(train_labels, test_labels, label=["phonemes"])

  return f(*args, **kwargs)


In [25]:
train_labels = to_categorical(train_labels, num_classes=total_unique_phonemes_2)
test_labels = to_categorical(test_labels, num_classes=total_unique_phonemes_2)
print(train_labels.shape)

(142978, 632)


In [15]:
from keras.wrappers.scikit_learn import KerasClassifier

def build_clf(optimizer='adam', classes=total_unique_phonemes_2):
    model = Sequential()

    model.add(Dense(units=512, activation='relu', input_dim=3*1))  # first hidden layer
    model.add(Dense(units=256, activation='relu'))  # second hidden layer
    # model.add(Dense(units=128, activation='relu'))  # third hidden layer
    model.add(Dense(units=classes, activation='softmax'))  # output layer
    # model.add(Dense(units=128))  # output layer

    model.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])
    
    return model

In [16]:
parameters = {'batch_size':[64,128,256],
          'nb_epoch':[16,32],
          'optimizer':['adam', 'rmsprop']}

In [20]:
orig_clf = KerasClassifier(build_fn=build_clf)
gscv_clf = GridSearchCV(orig_clf, parameters, cv=cv_dialect_splitter())

In [26]:
model = Sequential()

model.add(Dense(units=512, activation='relu', input_dim=3*1))  # first hidden layer
model.add(Dense(units=256, activation='relu'))  # second hidden layer
# model.add(Dense(units=128, activation='relu'))  # third hidden layer
model.add(Dense(units=total_unique_phonemes_2, activation='softmax'))  # output layer
# model.add(Dense(units=128))  # output layer

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 512)               2048      
_________________________________________________________________
dense_4 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_5 (Dense)              (None, 632)               162424    
Total params: 295,800
Trainable params: 295,800
Non-trainable params: 0
_________________________________________________________________


In [27]:
model.fit(train_set, train_labels, epochs=64, batch_size=256)

Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64
Epoch 14/64
Epoch 15/64
Epoch 16/64
Epoch 17/64
Epoch 18/64
Epoch 19/64
Epoch 20/64
Epoch 21/64
Epoch 22/64
Epoch 23/64
Epoch 24/64
Epoch 25/64
Epoch 26/64
Epoch 27/64
Epoch 28/64
Epoch 29/64
Epoch 30/64
Epoch 31/64
Epoch 32/64
Epoch 33/64
Epoch 34/64
Epoch 35/64
Epoch 36/64
Epoch 37/64
Epoch 38/64
Epoch 39/64
Epoch 40/64
Epoch 41/64
Epoch 42/64
Epoch 43/64
Epoch 44/64
Epoch 45/64
Epoch 46/64
Epoch 47/64
Epoch 48/64
Epoch 49/64
Epoch 50/64
Epoch 51/64
Epoch 52/64
Epoch 53/64
Epoch 54/64
Epoch 55/64
Epoch 56/64
Epoch 57/64
Epoch 58/64
Epoch 59/64
Epoch 60/64
Epoch 61/64
Epoch 62/64
Epoch 63/64
Epoch 64/64


<tensorflow.python.keras.callbacks.History at 0x7faf927d8320>

In [28]:
print("train loss, train acc:", model.evaluate(train_set, train_labels))

train loss, train acc: [3.2719388008117676, 0.21976108849048615]


In [29]:
print("test loss, test acc:", model.evaluate(test_set, test_labels))

test loss, test acc: [3.459465980529785, 0.21346040070056915]


### LSTM

In [30]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

In [31]:
# model_lstm = Sequential()
# model_lstm.add(LSTM(256, input_shape = (1, 3)))
# model_lstm.add(Dense(units=total_unique_words))
# model_lstm.compile(loss='categorical_crossentropy',
#               optimizer='adam',
#               metrics=['accuracy']
#              )

# model.summary()

In [32]:
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers import SpatialDropout1D

In [33]:
train_set, train_labels = get_labels(two_sentence_train, label=["words"])
test_set, test_labels = get_labels(two_sentence_test, label=["words"])

In [34]:
total_labels_2 = train_labels.append(test_labels)
print(len(pd.unique(train_labels.words)))
print(len(pd.unique(test_labels.words)))
total_unique_words_2 = len(pd.unique(total_labels_2.words))
total_unique_words_2

42
41


42

In [35]:
train_labels, test_labels, _ = prepare_labels(train_labels, test_labels)

  return f(*args, **kwargs)


In [36]:
train_labels = to_categorical(train_labels, num_classes=total_unique_words_2)
test_labels = to_categorical(test_labels, num_classes=total_unique_words_2)
print(train_labels.shape)

(142978, 42)


In [37]:
reshaped_values = train_set.values.reshape(-1, 1, 3)
reshaped_values[0][0]

array([ 0, 32, 32])

In [15]:
from keras.wrappers.scikit_learn import KerasClassifier

def build_lstm_clf(optimizer='adam', classes=total_unique_words_2):
    model = Sequential()

    model_lstm.add(LSTM(256, input_shape = (1, 3), dropout = 0.3, recurrent_dropout = 0.3))
    model_lstm.add(Dense(256, activation = 'relu'))
    model_lstm.add(Dropout(0.3))
    model_lstm.add(Dense(classes, activation = 'softmax'))

    model.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])
    
    return model

In [16]:
parameters = {'batch_size':[64,128,256],
          'nb_epoch':[16,32],
          'optimizer':['adam', 'rmsprop']}

In [20]:
orig_clf = KerasClassifier(build_fn=build_clf)
gscv_clf = GridSearchCV(orig_clf, parameters, cv=cv_dialect_splitter())

In [21]:
print("Starting!")
gscv_clf.fit(reshaped_values, train_labels)
print("Finished!")

print("Best: %f using %s" % (gscv_clf.best_score_, gscv_clf.best_params_))
print(f"Train accuracy: {gscv_clf.score(reshaped_values, train_labels):.4f}")
print(f"Test accuracy : {gscv_clf.score(test_set.values.reshape(-1, 1, 3), test_labels):.4f}")

Starting!
Finished!
Best: 0.051422 using {'batch_size': 64, 'nb_epoch': 1, 'optimizer': 'adam'}
Train accuracy: 0.0506
Test accuracy : 0.0465


In [38]:
#more elaborate model
model_lstm = Sequential()

#model_lstm.add(Embedding(input_dim = 3, output_dim = 2, input_length = 86497))
#model_lstm.add(SpatialDropout1D(0.3))
model_lstm.add(LSTM(256, input_shape = (1, 3), dropout = 0.3, recurrent_dropout = 0.3))
model_lstm.add(Dense(256, activation = 'relu'))
model_lstm.add(Dropout(0.3))
model_lstm.add(Dense(total_unique_words_2, activation = 'softmax'))

model_lstm.compile(
    loss='categorical_crossentropy',
    optimizer='Adam',
    metrics=['accuracy']
)

model_lstm.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 256)               266240    
_________________________________________________________________
dense_6 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 42)                10794     
Total params: 342,826
Trainable params: 342,826
Non-trainable params: 0
_________________________________________________________________


In [39]:
model_lstm.fit(reshaped_values, train_labels, epochs=64, batch_size=256)

Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64
Epoch 14/64
Epoch 15/64
Epoch 16/64
Epoch 17/64
Epoch 18/64
Epoch 19/64
Epoch 20/64
Epoch 21/64
Epoch 22/64
Epoch 23/64
Epoch 24/64
Epoch 25/64
Epoch 26/64
Epoch 27/64
Epoch 28/64
Epoch 29/64
Epoch 30/64
Epoch 31/64
Epoch 32/64
Epoch 33/64
Epoch 34/64
Epoch 35/64
Epoch 36/64
Epoch 37/64
Epoch 38/64
Epoch 39/64
Epoch 40/64
Epoch 41/64
Epoch 42/64
Epoch 43/64
Epoch 44/64
Epoch 45/64
Epoch 46/64
Epoch 47/64
Epoch 48/64
Epoch 49/64
Epoch 50/64
Epoch 51/64
Epoch 52/64
Epoch 53/64
Epoch 54/64
Epoch 55/64
Epoch 56/64
Epoch 57/64
Epoch 58/64
Epoch 59/64
Epoch 60/64
Epoch 61/64
Epoch 62/64
Epoch 63/64
Epoch 64/64


<tensorflow.python.keras.callbacks.History at 0x7faf92a0df98>

In [40]:
print("train loss, train acc:", model_lstm.evaluate(reshaped_values, train_labels))

train loss, train acc: [3.7270686626434326, 0.13907034695148468]


In [41]:
print("test loss, test acc:", model_lstm.evaluate(test_set.values.reshape(-1, 1, 3), test_labels))

test loss, test acc: [3.810607433319092, 0.13441301882266998]


#### Phonemes:

In [42]:
train_set, train_labels = get_labels(two_sentence_train, label=["phonemes"])
test_set, test_labels = get_labels(two_sentence_test, label=["phonemes"])

In [43]:
total_labels_2 = train_labels.append(test_labels)
print(len(pd.unique(train_labels.phonemes)))
print(len(pd.unique(test_labels.phonemes)))
total_unique_phonemes_2 = len(pd.unique(total_labels_2.phonemes))
total_unique_phonemes_2

587
420


632

In [44]:
train_labels, test_labels, _ = prepare_labels(train_labels, test_labels, label=["phonemes"])

In [45]:
train_labels = to_categorical(train_labels, num_classes=total_unique_phonemes_2)
test_labels = to_categorical(test_labels, num_classes=total_unique_phonemes_2)
print(train_labels.shape)

(142978, 632)


In [46]:
reshaped_values = train_set.values.reshape(-1, 1, 3)
reshaped_values[0][0]

array([ 0, 32, 32])

In [15]:
from keras.wrappers.scikit_learn import KerasClassifier

def build_lstm_clf(optimizer='adam', classes=total_unique_phonemes_2):
    model = Sequential()

    model_lstm.add(LSTM(256, input_shape = (1, 3), dropout = 0.3, recurrent_dropout = 0.3))
    model_lstm.add(Dense(256, activation = 'relu'))
    model_lstm.add(Dropout(0.3))
    model_lstm.add(Dense(classes, activation = 'softmax'))

    model.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])
    
    return model

In [16]:
parameters = {'batch_size':[64,128,256],
          'nb_epoch':[16,32],
          'optimizer':['adam', 'rmsprop']}

In [20]:
orig_clf = KerasClassifier(build_fn=build_clf)
gscv_clf = GridSearchCV(orig_clf, parameters, cv=cv_dialect_splitter())

In [21]:
print("Starting!")
gscv_clf.fit(reshaped_values, train_labels)
print("Finished!")

print("Best: %f using %s" % (gscv_clf.best_score_, gscv_clf.best_params_))
print(f"Train accuracy: {gscv_clf.score(reshaped_values, train_labels):.4f}")
print(f"Test accuracy : {gscv_clf.score(test_set.values.reshape(-1, 1, 3), test_labels):.4f}")

Starting!
Finished!
Best: 0.051422 using {'batch_size': 64, 'nb_epoch': 1, 'optimizer': 'adam'}
Train accuracy: 0.0506
Test accuracy : 0.0465


In [47]:
#more elaborate model
model_lstm = Sequential()

#model_lstm.add(Embedding(input_dim = 3, output_dim = 2, input_length = 86497))
#model_lstm.add(SpatialDropout1D(0.3))
model_lstm.add(LSTM(256, input_shape = (1, 3), dropout = 0.3, recurrent_dropout = 0.3))
model_lstm.add(Dense(256, activation = 'relu'))
model_lstm.add(Dropout(0.3))
model_lstm.add(Dense(total_unique_phonemes_2, activation = 'softmax'))

model_lstm.compile(
    loss='categorical_crossentropy',
    optimizer='Adam',
    metrics=['accuracy']
)

model_lstm.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 256)               266240    
_________________________________________________________________
dense_8 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 632)               162424    
Total params: 494,456
Trainable params: 494,456
Non-trainable params: 0
_________________________________________________________________


In [48]:
model_lstm.fit(reshaped_values, train_labels, epochs=64, batch_size=256)

Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64
Epoch 14/64
Epoch 15/64
Epoch 16/64
Epoch 17/64
Epoch 18/64
Epoch 19/64
Epoch 20/64
Epoch 21/64
Epoch 22/64
Epoch 23/64
Epoch 24/64
Epoch 25/64
Epoch 26/64
Epoch 27/64
Epoch 28/64
Epoch 29/64
Epoch 30/64
Epoch 31/64
Epoch 32/64
Epoch 33/64
Epoch 34/64
Epoch 35/64
Epoch 36/64
Epoch 37/64
Epoch 38/64
Epoch 39/64
Epoch 40/64
Epoch 41/64
Epoch 42/64
Epoch 43/64
Epoch 44/64
Epoch 45/64
Epoch 46/64
Epoch 47/64
Epoch 48/64
Epoch 49/64
Epoch 50/64
Epoch 51/64
Epoch 52/64
Epoch 53/64
Epoch 54/64
Epoch 55/64
Epoch 56/64
Epoch 57/64
Epoch 58/64
Epoch 59/64
Epoch 60/64
Epoch 61/64
Epoch 62/64
Epoch 63/64
Epoch 64/64


<tensorflow.python.keras.callbacks.History at 0x7faf930c6ef0>

In [49]:
print("train loss, train acc:", model_lstm.evaluate(reshaped_values, train_labels))

train loss, train acc: [6.40390682220459, 0.13299249112606049]


In [50]:
print("test loss, test acc:", model_lstm.evaluate(test_set.values.reshape(-1, 1, 3), test_labels))

test loss, test acc: [6.620239734649658, 0.12522679567337036]


### Decisions Tree

In [51]:
from sklearn.tree import DecisionTreeClassifier

In [52]:
train_set, train_labels = get_labels(two_sentence_train, label=["words"])
test_set, test_labels = get_labels(two_sentence_test, label=["words"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels)

In [13]:
parameters = {'criterion':['gini', 'entropy'], 'max_depth':[12, None], 'splitter':['best'],
              'min_samples_split':[2], 'random_state':[42]}

In [24]:
orig_clf = DecisionTreeClassifier()
gscv_clf = GridSearchCV(orig_clf, parameters, n_jobs = -1, cv=cv_dialect_splitter())

In [25]:
print("Starting!")
gscv_clf.fit(train_set, train_labels)
print("Finished!")

print("Best: %f using %s" % (gscv_clf.best_score_, gscv_clf.best_params_))
print(f"Train accuracy: {gscv_clf.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {gscv_clf.score(test_set, test_labels):.4f}")

Starting!
Finished!
Best: 0.143448 using {'criterion': 'gini', 'max_depth': 12, 'min_samples_split': 2, 'random_state': 42, 'splitter': 'best'}
Train accuracy: 0.1515
Test accuracy : 0.1411


In [53]:
tree_clf_pipeline = Pipeline(
    [
        (
            "clf",
            DecisionTreeClassifier(criterion="entropy", max_depth=None, splitter="best",
                                   min_samples_split=2, random_state=42),
        ),
    ]
)

In [54]:
print("Starting!")
tree_clf_pipeline.fit(train_set, train_labels)
print("Finished!")

print(f"Train accuracy: {tree_clf_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {tree_clf_pipeline.score(test_set, test_labels):.4f}")

Starting!
Finished!
Train accuracy: 0.5392
Test accuracy : 0.1512


Phonemes

In [55]:
train_set, train_labels = get_labels(two_sentence_train, label=["phonemes"])
test_set, test_labels = get_labels(two_sentence_test, label=["phonemes"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels, label=["phonemes"])

  return f(*args, **kwargs)


In [13]:
parameters = {'criterion':['gini', 'entropy'], 'max_depth':[12, None], 'splitter':['best'],
              'min_samples_split':[2], 'random_state':[42]}

In [24]:
orig_clf = DecisionTreeClassifier()
gscv_clf = GridSearchCV(orig_clf, parameters, n_jobs = -1, cv=cv_dialect_splitter())

In [25]:
print("Starting!")
gscv_clf.fit(train_set, train_labels)
print("Finished!")

print("Best: %f using %s" % (gscv_clf.best_score_, gscv_clf.best_params_))
print(f"Train accuracy: {gscv_clf.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {gscv_clf.score(test_set, test_labels):.4f}")

Starting!
Finished!
Best: 0.143448 using {'criterion': 'gini', 'max_depth': 12, 'min_samples_split': 2, 'random_state': 42, 'splitter': 'best'}
Train accuracy: 0.1515
Test accuracy : 0.1411


In [56]:
tree_clf_pipeline = Pipeline(
    [
        (
            "clf",
            DecisionTreeClassifier(criterion="entropy", max_depth=None, splitter="best",
                                   min_samples_split=2, random_state=42),
        ),
    ]
)

In [57]:
print("Starting!")
tree_clf_pipeline.fit(train_set, train_labels)
print("Finished!")

print(f"Train accuracy: {tree_clf_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {tree_clf_pipeline.score(test_set, test_labels):.4f}")

Starting!
Finished!
Train accuracy: 0.5287
Test accuracy : 0.1410


### KNN

In [58]:
from sklearn.neighbors import KNeighborsClassifier

In [59]:
train_set, train_labels = get_labels(two_sentence_train, label=["words"])
test_set, test_labels = get_labels(two_sentence_test, label=["words"])

train_labels, test_labels, _ = prepare_labels(two_sentence_train, test_labels)

  return f(*args, **kwargs)


In [32]:
parameters = {'n_neighbors':[16,32], 'weights':['uniform', 'distance'], 'n_jobs':[-1]}

In [33]:
orig_clf = KNeighborsClassifier()
gscv_clf = GridSearchCV(orig_clf, parameters, n_jobs = -1, cv=cv_dialect_splitter())

In [34]:
print("Starting!")
gscv_clf.fit(train_set, train_labels)
print("Finished!")

print("Best: %f using %s" % (gscv_clf.best_score_, gscv_clf.best_params_))
print(f"Train accuracy: {gscv_clf.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {gscv_clf.score(test_set, test_labels):.4f}")

Starting!
Finished!
Best: 0.136635 using {'n_jobs': -1, 'n_neighbors': 32, 'weights': 'uniform'}
Train accuracy: 0.1700
Test accuracy : 0.1343


In [60]:
knn_clf_pipeline = Pipeline(
    [
        (
            "scaler",
            StandardScaler()
        ),
        (
            "clf",
            KNeighborsClassifier(32, weights='distance', n_jobs=-1)
        ),
    ]
)

# 20, distance => 0.2887, 0.1203
# 32, uniform => 0.1700, 0.1343
# 32, distance => 0.2912, 0.1216

In [61]:
print("Starting!")
knn_clf_pipeline.fit(train_set, train_labels)
print("Finished!")

print(f"Train accuracy: {knn_clf_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {knn_clf_pipeline.score(test_set, test_labels):.4f}")

Starting!
Finished!
Train accuracy: 0.5392
Test accuracy : 0.1664


For the search space of 32 nearest neighbours we get around 12% success rate on our test data (which is around 31436 words). I have listed other parameters and their resulting percentages in the comments in the code cell. Also worth noting is that "StandardScaler" only worsens (not tested on skype) our predictions.

Now let's try our luck with phonemes:

In [62]:
train_set, train_labels = get_labels(two_sentence_train, label=["phonemes"])
test_set, test_labels = get_labels(two_sentence_test, label=["phonemes"])

train_labels, test_labels, _ = prepare_labels(train_labels, test_labels, label=["phonemes"])

  return f(*args, **kwargs)


In [32]:
parameters = {'n_neighbors':[16,32], 'weights':['uniform', 'distance'], 'n_jobs':[-1]}

In [33]:
orig_clf = KNeighborsClassifier()
gscv_clf = GridSearchCV(orig_clf, parameters, n_jobs = -1, cv=cv_dialect_splitter())

In [34]:
print("Starting!")
gscv_clf.fit(train_set, train_labels)
print("Finished!")

print("Best: %f using %s" % (gscv_clf.best_score_, gscv_clf.best_params_))
print(f"Train accuracy: {gscv_clf.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {gscv_clf.score(test_set, test_labels):.4f}")

Starting!
Finished!
Best: 0.136635 using {'n_jobs': -1, 'n_neighbors': 32, 'weights': 'uniform'}
Train accuracy: 0.1700
Test accuracy : 0.1343


In [63]:
knn_clf_pipeline = Pipeline(
    [
        (
            "clf",
            KNeighborsClassifier(32, weights='distance', n_jobs=-1)
        ),
    ]
)
# 20, distance => 0.3221, 0.1377
# 32, uniform => 0.2093, 0.1574
# 32, distance => 0.3265, 0.1410

In [64]:
print("Starting!")
knn_clf_pipeline.fit(train_set, train_labels)
print("Finished!")

print(f"Train accuracy: {knn_clf_pipeline.score(train_set, train_labels):.4f}")
print(f"Test accuracy : {knn_clf_pipeline.score(test_set, test_labels):.4f}")

Starting!
Finished!
Train accuracy: 0.5287
Test accuracy : 0.1586
