In [2]:
import pandas as pd
import csv
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time
import pickle
import math

from pprint import pprint
from datetime import datetime

Uncomment the next cell if running for the first time.

In [2]:
# !wget https://userinterfaces.aalto.fi/136Mkeystrokes/resources/136m-keystrokes.zip
# !unzip 136m-keystrokes.zip
# !mv Keystrokes/files data/

In [4]:
# Set seed for reproducibility
np.random.seed(58)

In [4]:
data_folder = 'data'
output_folder = 'preprocessed_feather'

# Get participants list from metadata file (quicker than listing subfolder!!!)
participants = pd.read_table(f'{data_folder}/metadata_participants.txt')['PARTICIPANT_ID']

# remove participant 3 (does not exist) and 127969 as it has an non-parseable file!!!
participants = participants.drop(participants[participants == 3].index)
participants = participants.drop(participants[participants == 127969].index)

participants = np.array(participants)

In [5]:
## Feature extraction
# Hold Latency (HL)
# Inter-key latency (IL)
# Press latency (PL)
# Release latency (RL)

def keystroke_seq_feat_extr(keystroke_seq_section):
    # This function assumes that there is a single test section per participant
    keystroke_seq_section = keystroke_seq_section.copy(deep=False)

    keystroke_seq_section['HL'] = keystroke_seq_section['RELEASE_TIME'] - keystroke_seq_section['PRESS_TIME']
    keystroke_seq_section['IL'] = keystroke_seq_section['PRESS_TIME'].shift(-1) - keystroke_seq_section['RELEASE_TIME']
    keystroke_seq_section['PL'] = keystroke_seq_section['HL'] + keystroke_seq_section['IL']
    keystroke_seq_section['RL'] = keystroke_seq_section['RELEASE_TIME'].shift(-1) - keystroke_seq_section['RELEASE_TIME']

    # There is a NaN for the last keystroke due to shifting, we can drop it
    keystroke_seq_section.dropna(inplace=True)

    # Drop features we are not going to need anymore (we already used them to calculate the new features)
    keystroke_seq_section.drop(['PRESS_TIME', 'RELEASE_TIME', 'TEST_SECTION_ID'], axis='columns', inplace=True)

    # Normalize the features as described in the paper
    keystroke_seq_section[['HL', 'IL', 'PL', 'RL']] = keystroke_seq_section[['HL', 'IL', 'PL', 'RL']].div(1000)
    keystroke_seq_section['KEYCODE'] = keystroke_seq_section['KEYCODE'].div(255)

    # Clip values bigger than one or lower than zero
    keystroke_seq_section[['HL', 'IL', 'PL', 'RL', 'KEYCODE']] = keystroke_seq_section[['HL', 'IL', 'PL', 'RL', 'KEYCODE']].clip(lower=0, upper=1)

    return keystroke_seq_section

In [6]:
os.makedirs(output_folder, exist_ok=True)

def preprocess_participant(participant_id):
    try:
        # ISO-8859-15, encoding that supports finnish characters as the dataset contains finnish participants.
        participant_data = pd.read_table(f'{data_folder}/{participant_id}_keystrokes.txt', encoding='ISO-8859-15', quoting=csv.QUOTE_NONE, usecols=[
                                        'TEST_SECTION_ID', 'KEYCODE', 'PRESS_TIME', 'RELEASE_TIME'], on_bad_lines='skip')
        
        participant_data = participant_data.groupby('TEST_SECTION_ID')
        for section_id in participant_data.groups:
            group_dataframe = participant_data.get_group(section_id)
            features = keystroke_seq_feat_extr(group_dataframe)
            
            os.makedirs(f'{output_folder}/{participant_id}/', exist_ok=True)
            features.reset_index().to_feather(
                f'{output_folder}/{participant_id}/{participant_id}_section_{section_id}_keystrokes.feather')

        with open(f'{output_folder}/METADATA_participant_sections.tsv', 'a') as f:
            f.write(f'{participant_id}\t{list(participant_data.groups.keys())}\n')
    except Exception as e:
        print(e)

In [7]:
from joblib import Parallel, delayed

Parallel(n_jobs=-1, verbose=1)(delayed(preprocess_participant)(participant_id) for participant_id in participants)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   13.5s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   29.4s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   50.0s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 4984 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 6034 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 7184 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 8434 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done 9784 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done 11234 tasks      |

[None,
 None,
 ...]

In [None]:
pd.read_table(f'preprocessed_feather/METADATA_participant_sections.tsv')

Unnamed: 0,5,"[7, 10, 15, 21, 25, 29, 33, 39, 43, 48, 52, 60, 71, 73, 83]"
0,23,"[113, 128, 142, 148, 158, 177, 202, 231, 256, ..."
1,7,"[11, 24, 35, 42, 50, 68, 85, 97, 111, 114, 126..."
2,24,"[116, 130, 137, 143, 147, 159, 167, 176, 183, ..."
3,32,"[173, 192, 223, 241, 261, 276, 293, 307, 318, ..."
4,30,"[144, 163, 211, 243, 349, 428, 517, 583, 615, ..."
...,...,...
168178,517928,"[5579266, 5579278, 5579284, 5579288, 5579293, ..."
168179,517936,"[5579367, 5579377, 5579393, 5579397, 5579404, ..."
168180,517943,"[5579477, 5579482, 5579488, 5579493, 5579498, ..."
168181,517944,"[5579484, 5579490, 5579492, 5579497, 5579503, ..."


In [10]:
pd.read_feather('preprocessed_feather/517928/517928_section_5579266_keystrokes.feather', columns=['KEYCODE', 'HL', 'IL', 'PL', 'RL'])

Unnamed: 0,KEYCODE,HL,IL,PL,RL
0,0.329412,0.152,0.0,0.152,0.144
1,0.282353,0.144,0.0,0.096,0.081
2,0.270588,0.129,0.0,0.104,0.159
3,0.321569,0.184,0.0,0.112,0.128
4,0.270588,0.2,0.0,0.16,0.128
5,0.12549,0.168,0.0,0.168,0.167
6,0.341176,0.167,0.097,0.264,0.224
7,0.270588,0.127,0.0,0.056,0.224
8,0.321569,0.295,0.0,0.191,0.001
9,0.12549,0.105,0.56,0.665,0.744
