# Data Processing

Change the data_path to your directory as needed

1.   key: the key pressed
2.   response_id: unique identifying code for this participant typing this
target sentence
3. response_content: response for this sentence as typed in full
4. participant_id: unique ID for participant
5. sentence_id: identifier for target sentence
6. sentence_content: target sentence as displayed to the participant
7. diagnosis:
> *   1=Typist has had a diagnosis of Parkinson's disease;
> * 0=Typist has not had a diagnosis of Parkinson's disease
8. keydown: Timestamp for press of key in milliseconds
9. keyup: Timestamp for release of key in milliseconds

In [20]:
import pandas as pd
import numpy as np
import os
import re
import json

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
#add path to data
data_path = '/content/drive/My Drive/266 Assignments/266 Final Project'

files = os.listdir(data_path)
files = [x for x in files if '.csv' in x]
files

['online_english_fold_all.csv',
 'MedicationInfo.csv',
 'CoNLL_2020_Online_English.csv',
 'cleaned_data.csv']

## Data processing
*   Create key_duration_ms column, which is keyup - keydown
*   Replace non-character keys (e.g. shift, enter, space, backspace, control) with special tokens

In [22]:
df = pd.read_csv(os.path.join(data_path, files[files.index('CoNLL_2020_Online_English.csv')]))
df['key_duration_ms'] = df['keyup'] - df['keydown']
df

Unnamed: 0,key,response_id,response_content,participant_id,sentence_id,sentence_content,diagnosis,keydown,keyup,key_duration_ms
0,shift,39,"Books include Penguin Island, a satire on the ...",11,1,"Books include Penguin Island, a satire on the ...",1,80077.6719,80672.2719,594.600
1,b,39,"Books include Penguin Island, a satire on the ...",11,1,"Books include Penguin Island, a satire on the ...",1,80511.5719,80615.2719,103.700
2,o,39,"Books include Penguin Island, a satire on the ...",11,1,"Books include Penguin Island, a satire on the ...",1,80800.2719,80861.4719,61.200
3,o,39,"Books include Penguin Island, a satire on the ...",11,1,"Books include Penguin Island, a satire on the ...",1,80949.9719,81076.3719,126.400
4,s,39,"Books include Penguin Island, a satire on the ...",11,1,"Books include Penguin Island, a satire on the ...",1,81305.5719,81480.7719,175.200
...,...,...,...,...,...,...,...,...,...,...
437910,r,2429,Over three million cattle are residents of the...,1167,15,Over three million cattle are residents of the...,0,709861.2300,709965.3350,104.105
437911,k,2429,Over three million cattle are residents of the...,1167,15,Over three million cattle are residents of the...,0,709997.2450,710077.2900,80.045
437912,e,2429,Over three million cattle are residents of the...,1167,15,Over three million cattle are residents of the...,0,710165.2550,710309.3200,144.065
437913,t,2429,Over three million cattle are residents of the...,1167,15,Over three million cattle are residents of the...,0,710413.2600,710509.4550,96.195


In [23]:
# #check distribution of key times per one response
# df.loc[df['response_id']==53, 'key_duration_ms'].plot.hist()

In [24]:
key_vocab = df['key'].unique()

#regex pattern
# pattern = r"^[\w,!.@#$%^}\&_{='/:);(|[\]]$"
pattern = r"^[\w,!.@#$%^}\&_{='/:);(|[\]\"'\\\\-]$"

#include only alphanumeric characters and special characters
vocab_subset = []

for i in key_vocab:
  matches = re.findall(pattern, i)
  vocab_subset.extend(matches)

vocab_subset.sort()

In [25]:
remaining_vocab = list(set(key_vocab) - set(vocab_subset))
remaining_vocab

['control',
 'meta',
 'alt',
 'home',
 'tab',
 'process',
 'arrowleft',
 'end',
 '\n',
 'unidentified',
 'enter',
 'shift',
 'capslock',
 'arrowright',
 'f16',
 ' ',
 'arrowup',
 'contextmenu',
 'pagedown',
 'mediaprevioustrack',
 'numlock',
 'backspace',
 'insert',
 'spacebar',
 'arrowdown',
 'delete',
 'f11']

In [26]:
#both ' ' and 'spacebar' are classed as '[SPCE]'
df.loc[df['key'] == ' ', 'key'] = 'spacebar'

#get all punctuation and alphanumeric characters
def clean_key(key):
    if key in vocab_subset:
        return key
    elif key in remaining_vocab:
        return '[' + key +']'
    else:
        return None
df['key_clean'] = df['key'].apply(clean_key)


df

Unnamed: 0,key,response_id,response_content,participant_id,sentence_id,sentence_content,diagnosis,keydown,keyup,key_duration_ms,key_clean
0,shift,39,"Books include Penguin Island, a satire on the ...",11,1,"Books include Penguin Island, a satire on the ...",1,80077.6719,80672.2719,594.600,[shift]
1,b,39,"Books include Penguin Island, a satire on the ...",11,1,"Books include Penguin Island, a satire on the ...",1,80511.5719,80615.2719,103.700,b
2,o,39,"Books include Penguin Island, a satire on the ...",11,1,"Books include Penguin Island, a satire on the ...",1,80800.2719,80861.4719,61.200,o
3,o,39,"Books include Penguin Island, a satire on the ...",11,1,"Books include Penguin Island, a satire on the ...",1,80949.9719,81076.3719,126.400,o
4,s,39,"Books include Penguin Island, a satire on the ...",11,1,"Books include Penguin Island, a satire on the ...",1,81305.5719,81480.7719,175.200,s
...,...,...,...,...,...,...,...,...,...,...,...
437910,r,2429,Over three million cattle are residents of the...,1167,15,Over three million cattle are residents of the...,0,709861.2300,709965.3350,104.105,r
437911,k,2429,Over three million cattle are residents of the...,1167,15,Over three million cattle are residents of the...,0,709997.2450,710077.2900,80.045,k
437912,e,2429,Over three million cattle are residents of the...,1167,15,Over three million cattle are residents of the...,0,710165.2550,710309.3200,144.065,e
437913,t,2429,Over three million cattle are residents of the...,1167,15,Over three million cattle are residents of the...,0,710413.2600,710509.4550,96.195,t


In [27]:
clean_vocab = list(df['key_clean'].unique())
#get a list of the special keys
special_tokens = []
for i in clean_vocab:
  if '[' in i and ']' in i:
    special_tokens.append(i)

special_tokens

['[shift]',
 '[backspace]',
 '[spacebar]',
 '[enter]',
 '[\n]',
 '[unidentified]',
 '[arrowleft]',
 '[arrowright]',
 '[arrowdown]',
 '[capslock]',
 '[delete]',
 '[numlock]',
 '[insert]',
 '[f11]',
 '[meta]',
 '[control]',
 '[arrowup]',
 '[mediaprevioustrack]',
 '[alt]',
 '[end]',
 '[pagedown]',
 '[tab]',
 '[f16]',
 '[process]',
 '[home]',
 '[contextmenu]']

In [28]:
unused_token_slots = [f"[unused{i}]" for i in range(1, len(special_tokens) + 1)]

charbert_token_map = dict(zip(special_tokens, unused_token_slots))
charbert_token_map

{'[shift]': '[unused1]',
 '[backspace]': '[unused2]',
 '[spacebar]': '[unused3]',
 '[enter]': '[unused4]',
 '[\n]': '[unused5]',
 '[unidentified]': '[unused6]',
 '[arrowleft]': '[unused7]',
 '[arrowright]': '[unused8]',
 '[arrowdown]': '[unused9]',
 '[capslock]': '[unused10]',
 '[delete]': '[unused11]',
 '[numlock]': '[unused12]',
 '[insert]': '[unused13]',
 '[f11]': '[unused14]',
 '[meta]': '[unused15]',
 '[control]': '[unused16]',
 '[arrowup]': '[unused17]',
 '[mediaprevioustrack]': '[unused18]',
 '[alt]': '[unused19]',
 '[end]': '[unused20]',
 '[pagedown]': '[unused21]',
 '[tab]': '[unused22]',
 '[f16]': '[unused23]',
 '[process]': '[unused24]',
 '[home]': '[unused25]',
 '[contextmenu]': '[unused26]'}

# Export Special tokens as JSON

In [29]:
#export unused token map
# Convert and write JSON object to file
file_path = os.path.join(data_path, "token_map.json")
with open(file_path, "w") as outfile:
    json.dump(charbert_token_map, outfile)

In [30]:
df['charbert_key'] = df['key_clean'].map(charbert_token_map)
df['charbert_key'] = df['charbert_key'].fillna(df['key_clean'])
df

Unnamed: 0,key,response_id,response_content,participant_id,sentence_id,sentence_content,diagnosis,keydown,keyup,key_duration_ms,key_clean,charbert_key
0,shift,39,"Books include Penguin Island, a satire on the ...",11,1,"Books include Penguin Island, a satire on the ...",1,80077.6719,80672.2719,594.600,[shift],[unused1]
1,b,39,"Books include Penguin Island, a satire on the ...",11,1,"Books include Penguin Island, a satire on the ...",1,80511.5719,80615.2719,103.700,b,b
2,o,39,"Books include Penguin Island, a satire on the ...",11,1,"Books include Penguin Island, a satire on the ...",1,80800.2719,80861.4719,61.200,o,o
3,o,39,"Books include Penguin Island, a satire on the ...",11,1,"Books include Penguin Island, a satire on the ...",1,80949.9719,81076.3719,126.400,o,o
4,s,39,"Books include Penguin Island, a satire on the ...",11,1,"Books include Penguin Island, a satire on the ...",1,81305.5719,81480.7719,175.200,s,s
...,...,...,...,...,...,...,...,...,...,...,...,...
437910,r,2429,Over three million cattle are residents of the...,1167,15,Over three million cattle are residents of the...,0,709861.2300,709965.3350,104.105,r,r
437911,k,2429,Over three million cattle are residents of the...,1167,15,Over three million cattle are residents of the...,0,709997.2450,710077.2900,80.045,k,k
437912,e,2429,Over three million cattle are residents of the...,1167,15,Over three million cattle are residents of the...,0,710165.2550,710309.3200,144.065,e,e
437913,t,2429,Over three million cattle are residents of the...,1167,15,Over three million cattle are residents of the...,0,710413.2600,710509.4550,96.195,t,t


In [31]:
sentences = df['sentence_id'].unique()
participants = df['participant_id'].unique()


new_data = []

for i in participants:
    participant_df = df[df['participant_id'] == i]
    diagnosis = participant_df['diagnosis'].unique()[0]

    for j in participant_df['sentence_id'].unique():
        keys = participant_df[participant_df['sentence_id'] == j]['charbert_key'].str.cat(sep=' ')
        timings = participant_df[participant_df['sentence_id'] == j]['key_duration_ms'].to_list()
        # keys = str(keys)
        response_id = participant_df[(participant_df['sentence_id'] == j)]['response_id'].unique()
        response_content = participant_df[(participant_df['sentence_id'] == j)]['response_content'].unique()
        sentence_content = participant_df[(participant_df['sentence_id'] == j)]['sentence_content'].unique()

        if len(keys) != 0 and len(response_id) != 0 and len(response_content) != 0 and len(sentence_content) != 0:
            new_data.append({
                'participant_id': i,
                'sentence_id': j,
                'key_sequence': keys,
                'timing_sequence': timings,
                'diagnosis': diagnosis,
                'response_id': response_id[0],
                'response_content': response_content[0],
                'sentence_content': sentence_content[0]
            })
        else:
            print(f"Missing data for participant {i}, sentence {j}")

new_df = pd.DataFrame(new_data)
new_df

Unnamed: 0,participant_id,sentence_id,key_sequence,timing_sequence,diagnosis,response_id,response_content,sentence_content
0,11,1,[unused1] b o o s [unused2] k s s [unused3] i ...,"[594.6000000000058, 103.70000000001164, 61.199...",1,39,"Books include Penguin Island, a satire on the ...","Books include Penguin Island, a satire on the ..."
1,11,2,"[unused1] h o w e v e r , [unused3] r e l i g ...","[328.5, 31.600000000002183, 76.79999999999927,...",1,37,"However, religions other than Islam, use a dif...","However, religions other than Islam, use a dif..."
2,11,3,[unused1] t h e [unused3] [unused1] f r a n k ...,"[448.3999999999796, 114.30000000001746, 97.200...",1,41,The Franks alliance was important exactly beca...,The Franks alliance was important exactly beca...
3,11,4,[unused1] h s [unused2] e [unused3] i s [unuse...,"[255.8000000000029, 140.29999999999563, 65.0, ...",1,38,"He is buried in Egypt, Aswan at the Mausoleum...","He is buried in Egypt, Aswan at the Mausoleum ..."
4,11,5,[unused1] t h e [unused3] w - s h a p e d [unu...,"[470.3000000000029, 145.60000000000582, 126.69...",1,40,The w-shaped glpyh above the second consonant ...,The w-shaped glyph above the second consonant ...
...,...,...,...,...,...,...,...,...
3429,1167,11,[unused1] t h e [unused3] n o v e l [unused3] ...,"[528.0299999279669, 80.06000006600516, 56.0749...",0,2422,The novel explores the relationship between P...,The novel explores the relationship between Pa...
3430,1167,12,[unused1] s p l i t - f i n g e r [unused3] a ...,"[568.1600000238977, 160.4300000470248, 71.9849...",0,2424,Split-finger aiming requires the archer to pla...,Split-finger aiming requires the archer to pla...
3431,1167,13,[unused1] t h e y [unused3] f o u g h t [unuse...,"[392.04499998595566, 72.10500002803747, 88.035...",0,2427,They fought a thirty years war on the side of ...,They fought a thirty years war on the side of ...
3432,1167,14,"[unused1] h o w e v e r , [unused3] t h e r e ...","[576.0049999230541, 88.08000001590699, 72.1200...",0,2428,"However, there is no evidence that those tatto...","However, there is no evidence that those tatto..."


In [32]:
#filter out response_content that is NaN
filt_df = new_df.loc[~new_df['response_content'].isnull()]
filt_df

Unnamed: 0,participant_id,sentence_id,key_sequence,timing_sequence,diagnosis,response_id,response_content,sentence_content
0,11,1,[unused1] b o o s [unused2] k s s [unused3] i ...,"[594.6000000000058, 103.70000000001164, 61.199...",1,39,"Books include Penguin Island, a satire on the ...","Books include Penguin Island, a satire on the ..."
1,11,2,"[unused1] h o w e v e r , [unused3] r e l i g ...","[328.5, 31.600000000002183, 76.79999999999927,...",1,37,"However, religions other than Islam, use a dif...","However, religions other than Islam, use a dif..."
2,11,3,[unused1] t h e [unused3] [unused1] f r a n k ...,"[448.3999999999796, 114.30000000001746, 97.200...",1,41,The Franks alliance was important exactly beca...,The Franks alliance was important exactly beca...
3,11,4,[unused1] h s [unused2] e [unused3] i s [unuse...,"[255.8000000000029, 140.29999999999563, 65.0, ...",1,38,"He is buried in Egypt, Aswan at the Mausoleum...","He is buried in Egypt, Aswan at the Mausoleum ..."
4,11,5,[unused1] t h e [unused3] w - s h a p e d [unu...,"[470.3000000000029, 145.60000000000582, 126.69...",1,40,The w-shaped glpyh above the second consonant ...,The w-shaped glyph above the second consonant ...
...,...,...,...,...,...,...,...,...
3429,1167,11,[unused1] t h e [unused3] n o v e l [unused3] ...,"[528.0299999279669, 80.06000006600516, 56.0749...",0,2422,The novel explores the relationship between P...,The novel explores the relationship between Pa...
3430,1167,12,[unused1] s p l i t - f i n g e r [unused3] a ...,"[568.1600000238977, 160.4300000470248, 71.9849...",0,2424,Split-finger aiming requires the archer to pla...,Split-finger aiming requires the archer to pla...
3431,1167,13,[unused1] t h e y [unused3] f o u g h t [unuse...,"[392.04499998595566, 72.10500002803747, 88.035...",0,2427,They fought a thirty years war on the side of ...,They fought a thirty years war on the side of ...
3432,1167,14,"[unused1] h o w e v e r , [unused3] t h e r e ...","[576.0049999230541, 88.08000001590699, 72.1200...",0,2428,"However, there is no evidence that those tatto...","However, there is no evidence that those tatto..."


# Export Cleaned Data

In [33]:
filt_df.to_csv(os.path.join(data_path, 'cleaned_data.csv'), index=False)