# Data Processing

Change `data_path` to point to your own directory as needed.

1.   key: the key pressed
2.   response_id: unique identifier for this participant's typing of this target sentence
3. response_content: response for this sentence as typed in full
4. participant_id: unique ID for participant
5. sentence_id: identifier for target sentence
6. sentence_content: target sentence as displayed to the participant
7. diagnosis:
> * 1 = Typist has been diagnosed with Parkinson's disease
> * 0 = Typist has not been diagnosed with Parkinson's disease
8. keydown: Timestamp for press of key in milliseconds
9. keyup: Timestamp for release of key in milliseconds

In [1]:
import pandas as pd
import numpy as np
import os
import re
import json

# Mount Google Drive at /content/drive so that files under it can be read and
# written by path. The google.colab module is imported here, so this cell is
# expected to run inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Folder on the mounted Drive that holds the project's CSV files.
# Change this to your own directory.
data_path = '/content/drive/My Drive/266 Assignments/266 Final Project'

# Keep only the entries whose name ends in ".csv". A substring test
# ('.csv' in x) would also pick up names such as "metrics.csv.bak".
files = [x for x in os.listdir(data_path) if x.endswith('.csv')]
files

['online_english_fold_all.csv',
 'MedicationInfo.csv',
 'CoNLL_2020_Online_English.csv',
 'cleaned_data.csv',
 'depreciated_wrongAUC_calc_metrics.csv',
 'analyzed_metrics.csv',
 'cleaned_data_with_ft.csv',
 'metrics.csv']

## Get Model metrics

In [3]:
# Look up metrics.csv in the directory listing (raises ValueError when the
# file is not present) and load it into a DataFrame.
metrics_name = files[files.index('metrics.csv')]
metrics_path = os.path.join(data_path, metrics_name)
df = pd.read_csv(metrics_path)
df

Unnamed: 0,model name,timestamp,accuracy,precision,recall,f1,auc
0,CBOW-LSTM on characters & timing,2023-12-03 03:29:07,0.525,0.576471,0.417021,0.483951,0.490955
1,CBOW-LSTM on characters & timing,2023-12-03 03:30:27,0.504545,0.532258,0.564103,0.547718,0.503029
2,CBOW-LSTM on characters & timing,2023-12-03 03:33:14,0.495455,0.560748,0.255319,0.350877,0.536399
3,BERT on characters,2023-12-03 03:47:51,0.46712,0.0,0.0,0.0,
4,BERT on characters,2023-12-03 03:53:43,0.53288,0.53288,1.0,0.695266,
5,BERT on characters,2023-12-03 03:59:42,0.469388,0.555556,0.021277,0.040984,
6,BERT on characters,2023-12-03 04:06:30,0.53288,0.53288,1.0,0.695266,0.527866
7,BERT on characters,2023-12-03 04:14:25,0.53288,0.53288,1.0,0.695266,0.495135
8,BERT on characters,2023-12-03 04:21:33,0.53288,0.53288,1.0,0.695266,0.421865
9,BERT on words,2023-12-03 04:32:01,0.53288,0.53288,1.0,0.695266,0.46289


In [11]:
# Parse the timestamps, then order the runs by model name (A-Z) and, within
# each model, from newest to oldest.
df = (
    df.assign(timestamp=pd.to_datetime(df['timestamp']))
      .sort_values(by=['model name', 'timestamp'], ascending=[True, False])
)

# With that ordering, head(3) keeps the three most recent runs of every model.
df2 = df.groupby('model name').head(3)
df2

Unnamed: 0,model name,timestamp,accuracy,precision,recall,f1,auc
8,BERT on characters,2023-12-03 04:21:33,0.53288,0.53288,1.0,0.695266,0.421865
7,BERT on characters,2023-12-03 04:14:25,0.53288,0.53288,1.0,0.695266,0.495135
6,BERT on characters,2023-12-03 04:06:30,0.53288,0.53288,1.0,0.695266,0.527866
11,BERT on words,2023-12-03 04:44:23,0.46712,0.0,0.0,0.0,0.526451
10,BERT on words,2023-12-03 04:38:15,0.46712,0.0,0.0,0.0,0.518002
9,BERT on words,2023-12-03 04:32:01,0.53288,0.53288,1.0,0.695266,0.46289
21,BERT-LSTM on characters,2023-12-03 05:46:34,0.53288,0.53288,1.0,0.695266,0.519882
13,BERT-LSTM on characters,2023-12-03 05:04:34,0.46712,0.0,0.0,0.0,0.510205
12,BERT-LSTM on characters,2023-12-03 04:58:43,0.46712,0.0,0.0,0.0,0.513706
51,BERT-LSTM on characters & flight time,2023-12-06 01:27:51,0.687075,0.929204,0.446809,0.603448,0.840054


In [9]:
# # Convert 'date_time_stamp' to datetime if it's not already in datetime format
# data['date_time_stamp'] = pd.to_datetime(data['date_time_stamp'])

# # Sort the DataFrame by 'model' and 'date_time_stamp' in descending order
# data = data.sort_values(by=['model', 'date_time_stamp'], ascending=[True, False])

# # Group by 'model' and select the top 3 rows for each group
# result = data.groupby('model').head(3)


In [10]:
# #drop first three rows of BERT on characters where auc was not exported and first three rows of Chars2Vec-LSTM.
# indices_to_exclude = [3, 4, 5, 27, 28, 29, 33, 34, 35, 36]

# df2 = df[~df.index.isin(indices_to_exclude)]
# df2

In [13]:
# new_data = {}

# df2.index = [
#     'baseline',
#     'BERT on characters',
#     'BERT-LSTM on characters',
#     'BERT-LSTM on characters & flight time',
#     'BERT on words',
#     'BERT-LSTM on words',
#     'BERT-LSTM on words & flight time',
#     'BERT-LSTM on characters, words & flight time',
#     'CBOW-LSTM on characters & flight time',
#     'Chars2Vec-LSTM on characters & flight time'
# ]

# for column in df2.columns:
#   mean_std = df[column].agg(['mean', 'std'])
#   new_data[column] = [f"{mean_std['mean']:.3f} ({mean_std['std']:.3f})"]

# new_df = pd.DataFrame(new_data)

# new_df

In [15]:
# Summarise each model's runs: mean and standard deviation of every metric.
metric_cols = ['accuracy', 'precision', 'recall', 'f1', 'auc']
agg_spec = {metric: ['mean', 'std'] for metric in metric_cols}
agg_df = df2.groupby('model name').agg(agg_spec)

# Row order for the summary table. reindex() drops models that are not listed
# here and inserts an all-NaN row for any listed model with no runs in df2.
custom_order = [
    'baseline',
    'BERT on characters',
    'BERT-LSTM on characters',
    'BERT-LSTM on characters & flight time',
    'BERT on words',
    'BERT-LSTM on words',
    'BERT-LSTM on words & flight time',
    'BERT-LSTM on characters, words & flight time',
    'CBOW-LSTM on characters & flight time',
    # 'Chars2Vec-LSTM on characters & timing',
]
agg_df = agg_df.reindex(custom_order)

# Collapse the two-level (metric, statistic) header into single names such as
# 'accuracy_mean' and 'accuracy_std'.
agg_df.columns = [f'{metric}_{stat}' for metric, stat in agg_df.columns]
agg_df

Unnamed: 0_level_0,accuracy_mean,accuracy_std,precision_mean,precision_std,recall_mean,recall_std,f1_mean,f1_std,auc_mean,auc_std
model name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
baseline,0.483749,0.041151,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0
BERT on characters,0.53288,0.0,0.53288,0.0,1.0,0.0,0.695266,0.0,0.481622,0.054277
BERT-LSTM on characters,0.48904,0.037966,0.177627,0.307658,0.333333,0.57735,0.231755,0.401412,0.514598,0.0049
BERT-LSTM on characters & flight time,0.699924,0.016715,0.939673,0.011047,0.466667,0.027358,0.62342,0.026612,0.845975,0.008028
BERT on words,0.48904,0.037966,0.177627,0.307658,0.333333,0.57735,0.231755,0.401412,0.502448,0.034518
BERT-LSTM on words,0.486772,0.023819,0.515982,0.026564,0.4,0.204565,0.431459,0.160339,0.512453,0.024514
BERT-LSTM on words & flight time,0.668934,0.014869,0.905816,0.021451,0.422695,0.024197,0.576172,0.024669,0.838167,0.005503
"BERT-LSTM on characters, words & flight time",0.70446,0.022713,0.933648,0.032103,0.479433,0.031939,0.63325,0.032881,0.845259,0.003567
CBOW-LSTM on characters & flight time,0.831818,0.010415,0.918511,0.015919,0.751403,0.010468,0.826557,0.010816,0.848485,0.006122


In [16]:
# Render every metric as a "mean (std)" string with three decimal places,
# e.g. "0.533 (0.000)".
three_dp = '{:.3f}'.format
new_data = {
    metric: agg_df[f'{metric}_mean'].map(three_dp)
    + " ("
    + agg_df[f'{metric}_std'].map(three_dp)
    + ")"
    for metric in ['accuracy', 'precision', 'recall', 'f1', 'auc']
}

new_df = pd.DataFrame(new_data)

new_df

Unnamed: 0_level_0,accuracy,precision,recall,f1,auc
model name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
baseline,0.484 (0.041),0.000 (0.000),0.000 (0.000),0.000 (0.000),0.500 (0.000)
BERT on characters,0.533 (0.000),0.533 (0.000),1.000 (0.000),0.695 (0.000),0.482 (0.054)
BERT-LSTM on characters,0.489 (0.038),0.178 (0.308),0.333 (0.577),0.232 (0.401),0.515 (0.005)
BERT-LSTM on characters & flight time,0.700 (0.017),0.940 (0.011),0.467 (0.027),0.623 (0.027),0.846 (0.008)
BERT on words,0.489 (0.038),0.178 (0.308),0.333 (0.577),0.232 (0.401),0.502 (0.035)
BERT-LSTM on words,0.487 (0.024),0.516 (0.027),0.400 (0.205),0.431 (0.160),0.512 (0.025)
BERT-LSTM on words & flight time,0.669 (0.015),0.906 (0.021),0.423 (0.024),0.576 (0.025),0.838 (0.006)
"BERT-LSTM on characters, words & flight time",0.704 (0.023),0.934 (0.032),0.479 (0.032),0.633 (0.033),0.845 (0.004)
CBOW-LSTM on characters & flight time,0.832 (0.010),0.919 (0.016),0.751 (0.010),0.827 (0.011),0.848 (0.006)


In [17]:
# Write the formatted "mean (std)" table into the data folder.
output_path = os.path.join(data_path, "analyzed_metrics_ft.csv")
new_df.to_csv(output_path)