In [1]:
%autosave 0
import pandas as pd
import numpy as np
import sklearn.preprocessing
#import seaborn as sns
#import matplotlib.pyplot as plt
#%matplotlib inline
#sns.set(style="white", palette="muted", color_codes=True)
pd.set_option("display.max_rows", 100)

Autosave disabled


## Import data

In [95]:
filename_raw = "data-2017-05-11-0947.csv"
df = pd.read_csv(filename_raw, low_memory=True, header=0)
df["event_id"] = np.arange(0, len(df))
display(df.head(n=2))
display(df.tail(n=2))

Unnamed: 0,id,interface,source,task,timestamp,uuid,value,event_id
0,other-72,rsvp,image,easy,1492633914824,379307c1-1320-44c5-826a-194c6ae3c763,,0
1,other-530,rsvp,image,easy,1492633914926,379307c1-1320-44c5-826a-194c6ae3c763,,1


Unnamed: 0,id,interface,source,task,timestamp,uuid,value,event_id
46287,other-415,rsvp,image,hard,1494357927141,b834995d-3bd4-4361-9b36-30ceb0b694a0,,46287
46288,hard-30,rsvp,image,hard,1494357927141,b834995d-3bd4-4361-9b36-30ceb0b694a0,,46288


In [96]:
len(df["uuid"].unique())

27

## Preprocessing

- quickly map uuids into human-readable names
- reorder columns:
```
timetstamp, uuid, interface, task, source, id, value
```

- later, process columns to combine the `id` and `value` columns.
- filter out observations from when the interface had a data collection issue.
- filter out users that did not complete all tasks

In [97]:
names = ["Alice", "Bob", "Charlie", "Dana", "Ellen", "Fred",
         "Gary", "Henry", "Irene", "Joseph", "Kenneth", "Louise",
         "Micah", "Nancy", "Odin", "Patty", "Quentin", "Randall",
         "Samantha", "Tracy", "Umberto", "Yolanda", "Zed",
         "Steve", "Brenda", "Christine", "Rachel", "Jon"]
import numpy as np
uuids = df["uuid"].unique()
uuids_unique, uuids_indices = np.unique(uuids, return_inverse=True)
d = {u: names[uuids_indices[i]] for i, u in enumerate(uuids_unique)}
df["uuid"] = df["uuid"].map(d)

In [98]:
def reorder_columns(df):
    cols = ["timestamp", "uuid", "interface", "task", "source", "id", "value", "event_id"]
    df = df[cols]
    return df
df = reorder_columns(df)

## Exploration

Summary of how many images people saw, by user, interface, and task.

In [99]:
(df
 .query("source == 'image'")
 .groupby(["uuid", "interface", "task"])
 .count())

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,timestamp,source,id,value
uuid,interface,task,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alice,rsvp,easy,240,240,240,0
Alice,rsvp,hard,240,240,240,0
Alice,rsvp,medium,240,240,240,0
Alice,traditional,easy,128,128,128,0
Alice,traditional,hard,240,240,240,0
Alice,traditional,medium,240,240,240,0
Bob,rsvp,easy,240,240,240,0
Bob,rsvp,hard,240,240,240,0
Bob,rsvp,medium,240,240,240,0
Brenda,rsvp,easy,240,240,240,0


Number of users who saw n=240 images for each interface/task combination.

In [100]:
n_images = 240
n_interfaces = 2
n_tasks = 3
n = n_images * n_interfaces * n_tasks

In [101]:
def set_of_users_who_saw_tasks(symbol):
    return (df
            .query("source == 'image'")
            .groupby("uuid")
            .count()
            .query("id {symbol} @n".format(symbol=symbol))
            [["id"]]
            .rename(columns={"id":"count"}))

In [102]:
# Set of users who saw exactly n=240 for each interface/task
set_of_users_who_saw_tasks('==')

Unnamed: 0_level_0,count
uuid,Unnamed: 1_level_1
Brenda,1440
Charlie,1440
Christine,1440
Dana,1440
Henry,1440
Odin,1440
Quentin,1440
Randall,1440
Samantha,1440


In [103]:
# Set of users who saw *less than* n=240 for each interface/task
set_of_users_who_saw_tasks('<')

Unnamed: 0_level_0,count
uuid,Unnamed: 1_level_1
Alice,1328
Bob,720
Ellen,1328
Fred,720
Gary,240
Irene,720
Joseph,1200
Kenneth,1328
Louise,1328
Micah,720


In [104]:
# Set of users who saw *more than* n=240 for each interface/task
set_of_users_who_saw_tasks('>')

Unnamed: 0_level_0,count
uuid,Unnamed: 1_level_1
Rachel,1610
Yolanda,1715


In [105]:
# We can see that there are two uuids, both with less than n tasks, that add up to n.
# Were they the same user, with a refreshed browser perhaps?
# Verdict: Unfortunately, both uuids are just associated with rsvp.
ids = map(lambda k: d[k], ["36b75c12-970e-43fd-b129-5960c2c10675", "cca6cbfb-7bd1-4307-bfea-8d8c212cfceb"])
(df
 .loc[(df["source"] == "image") & (df["uuid"].isin(ids)), :]
 .groupby(["uuid", "interface", "task"])
 .count()
 [["id"]]
 .rename(columns={"id":"count"}))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count
uuid,interface,task,Unnamed: 3_level_1
Micah,rsvp,easy,240
Micah,rsvp,hard,240
Micah,rsvp,medium,240
Steve,rsvp,easy,240
Steve,rsvp,hard,240
Steve,rsvp,medium,240


## Tranform to combine id and value columns

In [106]:
df = df.copy()
inds = df["id"].notnull()
df.loc[inds, "value"] = df.loc[inds, "id"]
del df["id"]

## Convert timestamps into datetimes.

In [107]:
df["timestamp"] = pd.to_datetime(df["timestamp"]*1000000)

In [108]:
df.tail(n=3)

Unnamed: 0,timestamp,uuid,interface,task,source,value
46286,2017-05-09 19:25:27.141,Rachel,rsvp,hard,image,other-284
46287,2017-05-09 19:25:27.141,Rachel,rsvp,hard,image,other-415
46288,2017-05-09 19:25:27.141,Rachel,rsvp,hard,image,hard-30


In [111]:
# time difference since start of task in milleseconds
df["timedelta"] = (df
                   .groupby(["uuid", "interface", "task"])
                   ["timestamp"]
                   .transform(lambda x: x - x.iloc[0])
                   .apply(lambda t: t.to_pytimedelta().total_seconds())
                  )

### Some brief exploration, while the data is still "raw"

In [121]:
n=999
with pd.option_context("display.max_rows", n):
    display(df.sort_values(by="timestamp").query("uuid == 'Charlie' & interface == 'rsvp'"))

Unnamed: 0,timestamp,uuid,interface,task,source,value,timedelta
37035,2017-05-03 16:34:58.680,Charlie,rsvp,medium,image,other-148,0.0
37036,2017-05-03 16:34:58.781,Charlie,rsvp,medium,image,medium-1,0.101
37037,2017-05-03 16:34:58.883,Charlie,rsvp,medium,image,other-152,0.203
37038,2017-05-03 16:34:58.984,Charlie,rsvp,medium,image,other-60,0.304
37039,2017-05-03 16:34:59.084,Charlie,rsvp,medium,image,other-375,0.404
37040,2017-05-03 16:34:59.186,Charlie,rsvp,medium,image,other-59,0.506
37041,2017-05-03 16:34:59.287,Charlie,rsvp,medium,image,other-92,0.607
37042,2017-05-03 16:34:59.387,Charlie,rsvp,medium,key,j,0.707
37043,2017-05-03 16:34:59.390,Charlie,rsvp,medium,image,other-122,0.71
37044,2017-05-03 16:34:59.492,Charlie,rsvp,medium,image,other-253,0.812


In [126]:
(df
 .query('interface == "rsvp"')
 .groupby(["uuid", "task"]).apply(lambda _df: _df["source"].value_counts())
 .unstack())

Unnamed: 0_level_0,Unnamed: 1_level_0,image,key
uuid,task,Unnamed: 2_level_1,Unnamed: 3_level_1
Alice,easy,240.0,
Alice,hard,240.0,
Alice,medium,240.0,
Bob,easy,240.0,
Bob,hard,240.0,
Bob,medium,240.0,
Brenda,easy,240.0,
Brenda,hard,240.0,
Brenda,medium,240.0,
Charlie,easy,240.0,18.0


In [None]:
(df
 .query('interface == "rsvp"')
 .groupby(["uuid", "task"]).apply(lambda _df: _df["source"].value_counts())
 .unstack())

## Change schema

Here, let's extract data for the traditional interface and put things in a nice manner.

We want schema
```
uuid, task, task_index, id, true_label, pred_label, delay
```

In [65]:
df = (df
      .query("interface == 'traditional'")
      .drop(["interface", "timestamp"], axis=1))

In [66]:
df.head(n=1)

Unnamed: 0,uuid,task,source,value,timedelta
2880,Patty,medium,image,medium-9,0.0


In [67]:
def process_task(df):
    image_mask = df["source"] == "image"
    key_mask   = df["source"] == "key"

    # image id as given by image name
    id_ser = (df
              .loc[image_mask, "value"]
              .apply(lambda x: x.split("-")[1]))

    # true label as given by image name
    true_label_ser = (df
                      .loc[image_mask, "value"]
                      .apply(lambda x: x.split("-")[0])
                      .map(lambda x: 0 if x=="other" else 1))

    # predicted label as identified from key press events
    pred_label_ser = (df
                      .loc[key_mask, "value"]
                      .map({"f": 0, "j": 1}))

    # align timedelta from key events and from show events and subtract
    # TODO double check this part; consider joining on "value", then subtracting with axis=1
    delay_ser = (df.loc[key_mask, "timedelta"].reset_index(drop=True) -
                 df.loc[image_mask, "timedelta"].reset_index(drop=True))

    return pd.DataFrame({"id": id_ser,
                         "true_label": true_label_ser,
                         "pred_label": pred_label_ser,
                         "delay": delay_ser})

In [68]:
df = (df
      .groupby(["uuid", "task"])
      .apply(process_task))
df.head(n=2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,delay,id,pred_label,true_label
uuid,task,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alice,easy,0,0.377,,,
Alice,easy,1,0.367,,,


In [None]:
df.index = df.index.rename("task_index", level=2)

In [None]:
df.head(n=3)

## Pre-processing the RSVP data

In [41]:
df = pd.read_csv("03_cleaned.csv", index_col=0)
df.head()

Unnamed: 0,timestamp,uuid,interface,task,source,value,timedelta
0,2017-04-19 20:31:54.824,379307c1-1320-44c5-826a-194c6ae3c763,rsvp,easy,image,other-72,0.0
1,2017-04-19 20:31:54.926,379307c1-1320-44c5-826a-194c6ae3c763,rsvp,easy,image,other-530,0.102
2,2017-04-19 20:31:55.027,379307c1-1320-44c5-826a-194c6ae3c763,rsvp,easy,image,other-575,0.203
3,2017-04-19 20:31:55.129,379307c1-1320-44c5-826a-194c6ae3c763,rsvp,easy,image,other-594,0.305
4,2017-04-19 20:31:55.230,379307c1-1320-44c5-826a-194c6ae3c763,rsvp,easy,image,other-582,0.406


In [43]:
# filter out rsvp events and delete interface column
cols = list(df.columns)
cols.remove("interface")
df = df.ix[df["interface"] == "rsvp", cols]

In [63]:
# find each uuid, task tuple that has at least 1 image shown. then, count the number 
# of keypresses for that uuid, task. remove all image and keypress events if the count
# is equal to or below `threshold`.
def get_good_rsvp(df, threshold):
    df1 = df.copy()
    grouped = df.ix[df["source"] == "image", :].groupby(["uuid", "task"])
    
    for name, group in grouped:
        nkeys = len(df.ix[(df["uuid"] == name[0]) 
                          & (df["task"] == name[1]) 
                          & (df["source"] == "key"), :])
        print(name)
        print("Found key events: {}".format(nkeys))
        if nkeys <= threshold:
            print("Less than threshold. Deleting.")
            # delete all rows matching this uuid and task
            mask = ~((df1["uuid"] == name[0]) & (df1["task"] == name[1]))
            df1 = df1.ix[mask, :]
            
    df1 = df1.reset_index(drop=True)
    
    return df1

In [64]:
df.head(n=3)

Unnamed: 0,timestamp,uuid,task,source,value,timedelta
0,2017-04-19 20:31:54.824,379307c1-1320-44c5-826a-194c6ae3c763,easy,image,other-72,0.0
1,2017-04-19 20:31:54.926,379307c1-1320-44c5-826a-194c6ae3c763,easy,image,other-530,0.102
2,2017-04-19 20:31:55.027,379307c1-1320-44c5-826a-194c6ae3c763,easy,image,other-575,0.203


In [77]:
df = get_good_rsvp(df, 0)

('12b1d3dc-228b-4bf9-b145-639a9c4b6bee', 'easy')
Found key events: 18
('12b1d3dc-228b-4bf9-b145-639a9c4b6bee', 'hard')
Found key events: 13
('12b1d3dc-228b-4bf9-b145-639a9c4b6bee', 'medium')
Found key events: 16
('36b75c12-970e-43fd-b129-5960c2c10675', 'easy')
Found key events: 0
Less than threshold. Deleting.
('36b75c12-970e-43fd-b129-5960c2c10675', 'hard')
Found key events: 0
Less than threshold. Deleting.
('36b75c12-970e-43fd-b129-5960c2c10675', 'medium')
Found key events: 0
Less than threshold. Deleting.
('379307c1-1320-44c5-826a-194c6ae3c763', 'easy')
Found key events: 0
Less than threshold. Deleting.
('379307c1-1320-44c5-826a-194c6ae3c763', 'hard')
Found key events: 0
Less than threshold. Deleting.
('379307c1-1320-44c5-826a-194c6ae3c763', 'medium')
Found key events: 0
Less than threshold. Deleting.
('3e040e04-6e89-4765-9079-085c51b0ad18', 'easy')
Found key events: 0
Less than threshold. Deleting.
('3e040e04-6e89-4765-9079-085c51b0ad18', 'hard')
Found key events: 0
Less than thres

In [79]:
len(df)

6355

In [80]:
df.head()

Unnamed: 0,timestamp,uuid,task,source,value,timedelta
0,2017-05-02 20:51:15.593,5fb56330-3c5f-4243-924c-e5457789918b,easy,image,other-379,0.0
1,2017-05-02 20:51:15.695,5fb56330-3c5f-4243-924c-e5457789918b,easy,image,easy-8,0.102
2,2017-05-02 20:51:15.802,5fb56330-3c5f-4243-924c-e5457789918b,easy,image,other-26,0.209
3,2017-05-02 20:51:15.904,5fb56330-3c5f-4243-924c-e5457789918b,easy,image,other-183,0.311
4,2017-05-02 20:51:16.019,5fb56330-3c5f-4243-924c-e5457789918b,easy,image,other-530,0.426


In [81]:
# extract image shows
df_images = df.ix[df["source"] == "image",:].reset_index(drop=True)
df_images["id"] = df_images["value"].apply(lambda x: x.split('-')[1])
df_images["label"] = df_images["value"].apply(lambda x: x.split('-')[0])
del df_images["value"]
df_images.head(n=2)

Unnamed: 0,timestamp,uuid,task,source,timedelta,id,label
0,2017-05-02 20:51:15.593,5fb56330-3c5f-4243-924c-e5457789918b,easy,image,0.0,379,other
1,2017-05-02 20:51:15.695,5fb56330-3c5f-4243-924c-e5457789918b,easy,image,0.102,8,easy


In [82]:
# label to 0/1
def label_map(label):
    if label=="other":
        return 0
    else:
        return 1
df_images["label"] = df_images["label"].apply(label_map)

In [83]:
df_images.head(n=2)

Unnamed: 0,timestamp,uuid,task,source,timedelta,id,label
0,2017-05-02 20:51:15.593,5fb56330-3c5f-4243-924c-e5457789918b,easy,image,0.0,379,0
1,2017-05-02 20:51:15.695,5fb56330-3c5f-4243-924c-e5457789918b,easy,image,0.102,8,1


In [84]:
df_images.to_csv("06_RSVP_images.csv")

In [85]:
# extract key presses
df_keys = df.ix[df["source"] == "key", :].reset_index(drop=True)

In [86]:
df_keys = df_keys.ix[:, ["timestamp", "uuid", "task", "source", "timedelta"]]

In [87]:
df_keys.to_csv("06_RSVP_keys.csv")