In [61]:
%autosave 0
import pandas as pd
import numpy as np
import sklearn.preprocessing
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set(style="white", palette="muted", color_codes=True)
pd.set_option("display.max_rows", 100)

Autosave disabled


## Import data

In [62]:
# Path (relative to the notebook) of the raw CSV export of the logged events.
filename_raw = "data-2017-05-11-0947.csv"
df = pd.read_csv(filename_raw, low_memory=True, header=0)
# Show the first and last two rows as a quick sanity check of the load.
display(df.head(n=2))
display(df.tail(n=2))

Unnamed: 0,id,interface,source,task,timestamp,uuid,value
0,other-72,rsvp,image,easy,1492633914824,379307c1-1320-44c5-826a-194c6ae3c763,
1,other-530,rsvp,image,easy,1492633914926,379307c1-1320-44c5-826a-194c6ae3c763,


Unnamed: 0,id,interface,source,task,timestamp,uuid,value
46287,other-415,rsvp,image,hard,1494357927141,b834995d-3bd4-4361-9b36-30ceb0b694a0,
46288,hard-30,rsvp,image,hard,1494357927141,b834995d-3bd4-4361-9b36-30ceb0b694a0,


In [63]:
# Number of distinct uuids in the log (a missing uuid would count as one value).
df["uuid"].nunique(dropna=False)

27

## Preprocessing

- reorder columns:
```
timestamp, uuid, interface, task, source, id, value
```

- later, process columns to combine the `id` and `value` columns.
- filter out observations from when the interface had a data collection issue.
- filter out users that did not complete all tasks

In [64]:
def reorder_columns(df):
    """Return `df` restricted to the analysis columns, in canonical order.

    Columns that are not listed are dropped from the result; a missing
    listed column raises KeyError.
    """
    ordered = ("timestamp", "uuid", "interface", "task", "source", "id", "value")
    return df.loc[:, list(ordered)]
# Apply the canonical column order; columns not listed in reorder_columns are dropped.
df = reorder_columns(df)

## Exploration

Summary of how many images people saw, by user, interface, and task.

In [65]:
# Restrict to image events, then count the non-null entries of every remaining
# column for each (uuid, interface, task) combination.
df.loc[df["source"] == "image"].groupby(["uuid", "interface", "task"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,timestamp,source,id,value
uuid,interface,task,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
12b1d3dc-228b-4bf9-b145-639a9c4b6bee,rsvp,easy,240,240,240,0
12b1d3dc-228b-4bf9-b145-639a9c4b6bee,rsvp,hard,240,240,240,0
12b1d3dc-228b-4bf9-b145-639a9c4b6bee,rsvp,medium,240,240,240,0
12b1d3dc-228b-4bf9-b145-639a9c4b6bee,traditional,easy,240,240,240,0
12b1d3dc-228b-4bf9-b145-639a9c4b6bee,traditional,hard,240,240,240,0
12b1d3dc-228b-4bf9-b145-639a9c4b6bee,traditional,medium,240,240,240,0
36b75c12-970e-43fd-b129-5960c2c10675,rsvp,easy,240,240,240,0
36b75c12-970e-43fd-b129-5960c2c10675,rsvp,hard,240,240,240,0
36b75c12-970e-43fd-b129-5960c2c10675,rsvp,medium,240,240,240,0
379307c1-1320-44c5-826a-194c6ae3c763,rsvp,easy,240,240,240,0


Number of users who saw n=240 images for each interface/task combination.

In [66]:
# Expected design: images per interface/task cell, number of interfaces, number of tasks.
n_images, n_interfaces, n_tasks = 240, 2, 3
# Total number of images a user sees across every interface/task combination.
n = n_images * n_interfaces * n_tasks

In [67]:
def set_of_users_who_saw_tasks(symbol):
    """Select users by comparing their total image count against the global `n`.

    `symbol` is a comparison operator given as a string (e.g. '==', '<', '>');
    it is spliced into a `DataFrame.query` expression of the form
    "id <symbol> @n". Reads the module-level `df` and `n`.

    Returns a frame indexed by uuid with a single column "count" holding the
    number of non-null `id` entries among that user's image rows.
    """
    per_user = df[df["source"] == "image"].groupby("uuid").count()
    comparison = "id " + symbol + " @n"
    selected = per_user.query(comparison)
    return selected[["id"]].rename(columns={"id": "count"})

In [68]:
# Users whose total image count equals n (240 images x 2 interfaces x 3 tasks = 1440).
# Note: this compares the per-user total only, not each interface/task cell individually.
set_of_users_who_saw_tasks('==')

Unnamed: 0_level_0,count
uuid,Unnamed: 1_level_1
12b1d3dc-228b-4bf9-b145-639a9c4b6bee,1440
54dcb6dc-be27-497f-a68f-821dfeb6ca96,1440
716faa5b-3528-4ea2-bb9e-e3d8340608f0,1440
797f7757-5ad7-4c98-9ff5-a6e5d6d2eb0d,1440
7a803995-3b85-49f2-924b-9d32d2425807,1440
7be5a317-6964-4ea6-8d93-f3bb16b10cd6,1440
b7c650c1-c651-413f-912a-72e7491579ed,1440
fecd21b6-88f4-4abe-b118-fac3a57dd429,1440
ffc49fa7-843c-4678-9e94-c2d3f54f2317,1440


In [69]:
# Users whose total image count is *less than* n (= 1440 images over all interface/task cells).
set_of_users_who_saw_tasks('<')

Unnamed: 0_level_0,count
uuid,Unnamed: 1_level_1
36b75c12-970e-43fd-b129-5960c2c10675,720
379307c1-1320-44c5-826a-194c6ae3c763,720
3e040e04-6e89-4765-9079-085c51b0ad18,1328
4d5c2ff2-1794-4ec2-b8a1-ef7c02543967,1200
5fb56330-3c5f-4243-924c-e5457789918b,1328
83910ff3-b87a-45b1-b45f-0ce3a2464dc5,1328
95e9f4d4-97e6-4cea-a80c-b62ed866e79b,1434
967b4b18-ca20-48de-a642-9c6a26f28284,720
ab2a8422-0c53-4d49-9a60-cdf810b4a89e,240
b5ac1c45-d258-441f-aeee-b574c2eb8005,1328


In [70]:
# Users whose total image count is *more than* n (= 1440 images over all interface/task cells).
set_of_users_who_saw_tasks('>')

Unnamed: 0_level_0,count
uuid,Unnamed: 1_level_1
aacc44cd-8f82-458b-a65b-b94b43df5ab9,1715
b834995d-3bd4-4361-9b36-30ceb0b694a0,1610


In [71]:
# Two uuids each saw fewer than n images, and their counts add up to n.
# Could they be the same user, perhaps after a browser refresh?
# Verdict: unfortunately, both uuids are only associated with rsvp.
ids = ["36b75c12-970e-43fd-b129-5960c2c10675", "cca6cbfb-7bd1-4307-bfea-8d8c212cfceb"]
(df[(df["source"] == "image") & df["uuid"].isin(ids)]
 .groupby(["uuid", "interface", "task"])
 ["id"]
 .count()
 .to_frame("count"))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count
uuid,interface,task,Unnamed: 3_level_1
36b75c12-970e-43fd-b129-5960c2c10675,rsvp,easy,240
36b75c12-970e-43fd-b129-5960c2c10675,rsvp,hard,240
36b75c12-970e-43fd-b129-5960c2c10675,rsvp,medium,240
cca6cbfb-7bd1-4307-bfea-8d8c212cfceb,rsvp,easy,240
cca6cbfb-7bd1-4307-bfea-8d8c212cfceb,rsvp,hard,240
cca6cbfb-7bd1-4307-bfea-8d8c212cfceb,rsvp,medium,240


## Transform to combine id and value columns

In [72]:
# Work on a private copy so the column assignment below never targets a view.
df = df.copy()
# Rows that carry an `id`: for those, `id` takes the place of `value`.
inds = df["id"].notnull()
df["value"] = df["id"].where(inds, df["value"])
# `id` is now redundant.
df = df.drop("id", axis=1)

In [73]:
# Check the tail: rows that had an `id` should now carry it in `value`.
df.tail()

Unnamed: 0,timestamp,uuid,interface,task,source,value
46284,1494357927038,b834995d-3bd4-4361-9b36-30ceb0b694a0,rsvp,hard,image,other-484
46285,1494357927038,b834995d-3bd4-4361-9b36-30ceb0b694a0,rsvp,hard,image,other-481
46286,1494357927141,b834995d-3bd4-4361-9b36-30ceb0b694a0,rsvp,hard,image,other-284
46287,1494357927141,b834995d-3bd4-4361-9b36-30ceb0b694a0,rsvp,hard,image,other-415
46288,1494357927141,b834995d-3bd4-4361-9b36-30ceb0b694a0,rsvp,hard,image,hard-30


## Convert timestamps into datetimes.

In [74]:
# The raw timestamps are epoch milliseconds; let pandas do the unit conversion.
df["timestamp"] = pd.to_datetime(df["timestamp"], unit="ms")

In [75]:
# Preview: `timestamp` should now display as datetimes instead of integers.
df.head()

Unnamed: 0,timestamp,uuid,interface,task,source,value
0,2017-04-19 20:31:54.824,379307c1-1320-44c5-826a-194c6ae3c763,rsvp,easy,image,other-72
1,2017-04-19 20:31:54.926,379307c1-1320-44c5-826a-194c6ae3c763,rsvp,easy,image,other-530
2,2017-04-19 20:31:55.027,379307c1-1320-44c5-826a-194c6ae3c763,rsvp,easy,image,other-575
3,2017-04-19 20:31:55.129,379307c1-1320-44c5-826a-194c6ae3c763,rsvp,easy,image,other-594
4,2017-04-19 20:31:55.230,379307c1-1320-44c5-826a-194c6ae3c763,rsvp,easy,image,other-582


In [76]:
# Confirm the conversion: `timestamp` should report a datetime dtype.
df.dtypes

timestamp    datetime64[ns]
uuid                 object
interface            object
task                 object
source               object
value                object
dtype: object

In [77]:
# Elapsed time, in seconds, since the first event of each (uuid, interface, task) group.
# "First" means first in row order (x.iloc[0]), so this assumes the rows of each
# group are already in chronological order.
df["timedelta"] = (df
                   .groupby(["uuid", "interface", "task"])
                   ["timestamp"]
                   .transform(lambda x: x - x.iloc[0])
                   .dt.total_seconds()
                  )

In [78]:
# Preview the new `timedelta` column (seconds since the first event of each group).
df.head(n=5)

Unnamed: 0,timestamp,uuid,interface,task,source,value,timedelta
0,2017-04-19 20:31:54.824,379307c1-1320-44c5-826a-194c6ae3c763,rsvp,easy,image,other-72,0.0
1,2017-04-19 20:31:54.926,379307c1-1320-44c5-826a-194c6ae3c763,rsvp,easy,image,other-530,0.102
2,2017-04-19 20:31:55.027,379307c1-1320-44c5-826a-194c6ae3c763,rsvp,easy,image,other-575,0.203
3,2017-04-19 20:31:55.129,379307c1-1320-44c5-826a-194c6ae3c763,rsvp,easy,image,other-594,0.305
4,2017-04-19 20:31:55.230,379307c1-1320-44c5-826a-194c6ae3c763,rsvp,easy,image,other-582,0.406


# Pre-processing the traditional interface data

- set uuid, interface, task as compound index? maybe not
- first plot. show distribution of time to identify images for traditional interface, easy task.

Here, let's extract data for the traditional interface and put things in a nice manner.

We want schema
uuid, task, task_index, timestamp, id, label, pred, delay

In [35]:
# Keep only the traditional-interface events and the columns used downstream.
df = df[df["interface"] == "traditional"][["uuid", "task", "source", "value", "timedelta"]]
df.head(n=1)

Unnamed: 0,uuid,task,source,value,timedelta
2880,83910ff3-b87a-45b1-b45f-0ce3a2464dc5,medium,image,medium-9,0.0


In [36]:
def f(df):
    """Pair each image event with a key event for one group of rows.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns "source" ("image" or "key"), "value"
        ("<label>-<id>" for image rows, the pressed key for key rows) and
        "timedelta".

    Returns
    -------
    DataFrame
        One row per position with the columns
        - "id": the part of the image value after the first '-',
        - "label": 0 when the part before the first '-' is "other", else 1,
        - "prediction": 0 for key "f", 1 for key "j",
        - "delay": key `timedelta` minus image `timedelta`.

    Notes
    -----
    Image rows and key rows are matched purely by position (the n-th image
    with the n-th key). When the two counts differ, the unmatched positions
    are filled with NaN by index alignment.

    Raises
    ------
    KeyError
        If a key event holds anything other than "f" or "j".
    """
    is_image = df["source"] == "image"
    is_key = df["source"] == "key"

    # extract id and label from the image values ("<label>-<id>")
    shown = df.loc[is_image, "value"].reset_index(drop=True)
    id_ = shown.apply(lambda g: g.split('-')[1])
    label = shown.apply(lambda g: g.split('-')[0])

    # extract the prediction from the key values
    prediction = df.loc[is_key, "value"].reset_index(drop=True)

    # compute time differences by aligning the key and image subsets by position
    time_key = df.loc[is_key, "timedelta"].reset_index(drop=True)
    time_image = df.loc[is_image, "timedelta"].reset_index(drop=True)
    delay = time_key - time_image

    # Turn labels and keypresses into 0/1. Anything that is not the negative
    # label counts as positive, so a group containing only "other" images is
    # handled instead of failing while looking up a positive label.
    neg_label = "other"
    key_to_class = {"f": 0, "j": 1}
    label = label.apply(lambda name: 0 if name == neg_label else 1)
    prediction = prediction.apply(lambda key: key_to_class[key])

    return pd.DataFrame({"id" : id_, "label" : label, "prediction" : prediction, "delay" : delay})

In [37]:
# Run f once per (uuid, task) group; f's positional row index becomes the third index level.
df = df.groupby(["uuid", "task"]).apply(f)

In [38]:
# Give the third index level (the position within each uuid/task group) a name.
df.index = df.index.set_names("task_index", level=2)

In [39]:
# Preview the reshaped frame, indexed by (uuid, task, task_index).
df.head(n=3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,delay,id,label,prediction
uuid,task,task_index,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
12b1d3dc-228b-4bf9-b145-639a9c4b6bee,easy,0,3.982,574,0,0.0
12b1d3dc-228b-4bf9-b145-639a9c4b6bee,easy,1,0.47,159,0,0.0
12b1d3dc-228b-4bf9-b145-639a9c4b6bee,easy,2,0.509,182,0,0.0


In [40]:
# Persist the traditional-interface table, including its index levels, to CSV.
df.to_csv("04_grouped_and_tranformed.csv")

## Pre-processing the RSVP data

In [41]:
# Load the cleaned event log, using the first CSV column as the index.
# NOTE(review): no visible cell writes "03_cleaned.csv" -- presumably it comes from an
# earlier notebook or step; confirm before relying on Restart & Run All.
# NOTE(review): `timestamp` is not parsed here (no parse_dates), so it is presumably
# read back as strings -- confirm nothing downstream needs datetimes.
df = pd.read_csv("03_cleaned.csv", index_col=0)
df.head()

Unnamed: 0,timestamp,uuid,interface,task,source,value,timedelta
0,2017-04-19 20:31:54.824,379307c1-1320-44c5-826a-194c6ae3c763,rsvp,easy,image,other-72,0.0
1,2017-04-19 20:31:54.926,379307c1-1320-44c5-826a-194c6ae3c763,rsvp,easy,image,other-530,0.102
2,2017-04-19 20:31:55.027,379307c1-1320-44c5-826a-194c6ae3c763,rsvp,easy,image,other-575,0.203
3,2017-04-19 20:31:55.129,379307c1-1320-44c5-826a-194c6ae3c763,rsvp,easy,image,other-594,0.305
4,2017-04-19 20:31:55.230,379307c1-1320-44c5-826a-194c6ae3c763,rsvp,easy,image,other-582,0.406


In [43]:
# keep only the rsvp events and drop the interface column
# (.loc replaces .ix, which was removed from pandas in 1.0)
cols = list(df.columns)
cols.remove("interface")
df = df.loc[df["interface"] == "rsvp", cols]

In [63]:
# find each uuid, task tuple that has at least 1 image shown. then, count the number
# of keypresses for that uuid, task. remove all image and keypress events if the count
# is equal to or below `threshold`.
def get_good_rsvp(df, threshold):
    """Drop every (uuid, task) group that has images but too few key events.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns "uuid", "task" and "source"
        ("image" or "key"). It is not modified.
    threshold : int
        A group is removed when its number of key events is less than or
        equal to this value.

    Returns
    -------
    DataFrame
        The surviving rows with a fresh 0..n-1 index. Groups that contain no
        image row are never examined and are therefore always kept.

    Side effects
    ------------
    Prints each examined (uuid, task) pair, its key-event count, and a
    deletion notice when the group is removed.
    """
    is_image = df["source"] == "image"
    is_key = df["source"] == "key"
    # Accumulate one row mask instead of re-slicing the frame for every deletion.
    keep = pd.Series(True, index=df.index)

    for name, _ in df.loc[is_image, :].groupby(["uuid", "task"]):
        in_group = (df["uuid"] == name[0]) & (df["task"] == name[1])
        nkeys = int((in_group & is_key).sum())
        print(name)
        print("Found key events: {}".format(nkeys))
        # The comparison is inclusive even though the message below says "Less than".
        if nkeys <= threshold:
            print("Less than threshold. Deleting.")
            # drop all rows matching this uuid and task
            keep &= ~in_group

    return df.loc[keep, :].reset_index(drop=True)

In [64]:
# Preview the rsvp-only frame; the `interface` column has been dropped.
df.head(n=3)

Unnamed: 0,timestamp,uuid,task,source,value,timedelta
0,2017-04-19 20:31:54.824,379307c1-1320-44c5-826a-194c6ae3c763,easy,image,other-72,0.0
1,2017-04-19 20:31:54.926,379307c1-1320-44c5-826a-194c6ae3c763,easy,image,other-530,0.102
2,2017-04-19 20:31:55.027,379307c1-1320-44c5-826a-194c6ae3c763,easy,image,other-575,0.203


In [77]:
# threshold=0: drop every (uuid, task) group that showed images but has no key event at all.
df = get_good_rsvp(df, 0)

('12b1d3dc-228b-4bf9-b145-639a9c4b6bee', 'easy')
Found key events: 18
('12b1d3dc-228b-4bf9-b145-639a9c4b6bee', 'hard')
Found key events: 13
('12b1d3dc-228b-4bf9-b145-639a9c4b6bee', 'medium')
Found key events: 16
('36b75c12-970e-43fd-b129-5960c2c10675', 'easy')
Found key events: 0
Less than threshold. Deleting.
('36b75c12-970e-43fd-b129-5960c2c10675', 'hard')
Found key events: 0
Less than threshold. Deleting.
('36b75c12-970e-43fd-b129-5960c2c10675', 'medium')
Found key events: 0
Less than threshold. Deleting.
('379307c1-1320-44c5-826a-194c6ae3c763', 'easy')
Found key events: 0
Less than threshold. Deleting.
('379307c1-1320-44c5-826a-194c6ae3c763', 'hard')
Found key events: 0
Less than threshold. Deleting.
('379307c1-1320-44c5-826a-194c6ae3c763', 'medium')
Found key events: 0
Less than threshold. Deleting.
('3e040e04-6e89-4765-9079-085c51b0ad18', 'easy')
Found key events: 0
Less than threshold. Deleting.
('3e040e04-6e89-4765-9079-085c51b0ad18', 'hard')
Found key events: 0
Less than thres

In [79]:
# Number of rows remaining after get_good_rsvp filtered the frame.
len(df)

6355

In [80]:
# Preview the filtered rsvp events.
df.head()

Unnamed: 0,timestamp,uuid,task,source,value,timedelta
0,2017-05-02 20:51:15.593,5fb56330-3c5f-4243-924c-e5457789918b,easy,image,other-379,0.0
1,2017-05-02 20:51:15.695,5fb56330-3c5f-4243-924c-e5457789918b,easy,image,easy-8,0.102
2,2017-05-02 20:51:15.802,5fb56330-3c5f-4243-924c-e5457789918b,easy,image,other-26,0.209
3,2017-05-02 20:51:15.904,5fb56330-3c5f-4243-924c-e5457789918b,easy,image,other-183,0.311
4,2017-05-02 20:51:16.019,5fb56330-3c5f-4243-924c-e5457789918b,easy,image,other-530,0.426


In [81]:
# extract image shows; `value` has the form "<label>-<id>" and is split into two columns
# (.loc replaces .ix, which was removed from pandas in 1.0)
df_images = df.loc[df["source"] == "image", :].reset_index(drop=True)
df_images["id"] = df_images["value"].apply(lambda x: x.split('-')[1])
df_images["label"] = df_images["value"].apply(lambda x: x.split('-')[0])
del df_images["value"]
df_images.head(n=2)

Unnamed: 0,timestamp,uuid,task,source,timedelta,id,label
0,2017-05-02 20:51:15.593,5fb56330-3c5f-4243-924c-e5457789918b,easy,image,0.0,379,other
1,2017-05-02 20:51:15.695,5fb56330-3c5f-4243-924c-e5457789918b,easy,image,0.102,8,easy


In [82]:
# label to 0/1
def label_map(label):
    """Return 0 for the negative label "other" and 1 for any other label."""
    return 0 if label == "other" else 1
# Replace every label string with its 0/1 encoding.
df_images["label"] = df_images["label"].apply(label_map)

In [83]:
# Confirm that `label` now holds 0/1 instead of the label string.
df_images.head(n=2)

Unnamed: 0,timestamp,uuid,task,source,timedelta,id,label
0,2017-05-02 20:51:15.593,5fb56330-3c5f-4243-924c-e5457789918b,easy,image,0.0,379,0
1,2017-05-02 20:51:15.695,5fb56330-3c5f-4243-924c-e5457789918b,easy,image,0.102,8,1


In [84]:
# Save the rsvp image events; the integer index is written as the first CSV column.
df_images.to_csv("06_RSVP_images.csv")

In [85]:
# extract key presses (.loc replaces .ix, which was removed from pandas in 1.0)
df_keys = df.loc[df["source"] == "key", :].reset_index(drop=True)

In [86]:
# Keep only the columns that are meaningful for key events; `value` is dropped.
# (.loc replaces .ix, which was removed from pandas in 1.0)
df_keys = df_keys.loc[:, ["timestamp", "uuid", "task", "source", "timedelta"]]

In [87]:
# Save the rsvp key events; the integer index is written as the first CSV column.
df_keys.to_csv("06_RSVP_keys.csv")