# Introduction

Notebook to analyse the PyBossa taskruns from the Expert App Version 2.

# Load Libraries and Data

In [1]:
# Project-local helper; presumably extends the import search path so that the
# `modules.*` package imported in the next cell can be resolved -- TODO confirm.
from mod_finder_util import mod_finder_util
mod_finder_util.add_modules_origin_search_path()

In [2]:
import pandas as pd
import seaborn as sns
import numpy as np

import modules.utils.firefox_dataset_p2 as fd

from sklearn.metrics import cohen_kappa_score

# Load the Expert App v2 taskruns into a DataFrame via the project reader
# (the reader appears to print the frame's shape as a side effect).
taskruns = fd.TaskRuns.read_expert_taskruns_2_df()

TaskRuns shape: (93, 11)


# Grouping Contributions By Time

In [3]:
# Parse the creation timestamps, then derive the calendar parts with the
# vectorised `.dt` accessor rather than a Python lambda applied row by row.
taskruns['created'] = pd.to_datetime(taskruns['created'], yearfirst=True)
taskruns['created_month'] = taskruns['created'].dt.month
taskruns['created_day'] = taskruns['created'].dt.day
taskruns['created_hour'] = taskruns['created'].dt.hour
# NOTE(review): depending on the pandas version, the derived columns may be
# int32 instead of int64; the grouping below is unaffected by that.

# Count the non-null values of every column per (month, day, hour) bucket to
# see when the contributions were submitted.
grouped_trs = taskruns.groupby(by=['created_month','created_day','created_hour']).count()
grouped_trs

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,bug_id,user_id,task_id,created,finish_time,user_ip,link,timeout,project_id,id,answers
created_month,created_day,created_hour,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
3,7,0,38,38,38,38,38,38,38,38,38,38,38
3,7,11,6,6,6,6,6,6,6,6,6,6,6
3,7,12,49,49,49,49,49,49,49,49,49,49,49


# Count TaskRuns by Task

In [4]:
# Number of taskruns submitted for each task: count the taskrun ids per
# task_id and expose the result under a descriptive column name.
cols = ['task_id', 'id']

df = (
    taskruns[cols]
    .groupby(by='task_id')
    .count()
    .rename(columns={'id': 'count_trs'})
)

# Show the last few tasks and the overall (n_tasks, 1) shape.
display(df.tail(10))
print(df.shape)

Unnamed: 0_level_0,count_trs
task_id,Unnamed: 1_level_1
2011,1
2012,1
2013,1
2014,1
2015,1
2016,1
2017,1
2018,1
2019,1
2020,1


(93, 1)


# Check All Tasks Have At Least One Answer

In [5]:
# Task ids the project is expected to contain (inclusive range).
expected_task_ids = set(range(1928, 2020 + 1))

# Tasks that received no taskrun at all.  The difference must be taken as
# "expected minus answered": the opposite direction only reveals unexpected
# ids and stays empty even when some tasks have no answer.
missing_task_ids = expected_task_ids - set(df.index)
print(missing_task_ids)

set()


# Analysis of Taskrun Information

In [6]:
# Preview the first ten rows of each bug's id and its answer vector.
taskruns.loc[:, ['bug_id', 'answers']].head(10)

Unnamed: 0,bug_id,answers
0,1181835,0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1,1248267,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2,1248268,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3,1257087,1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
4,1264988,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
5,1267480,0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
6,1267501,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
7,1269348,0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
8,1269485,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
9,1270274,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


In [7]:
# Inspect the column dtypes; note that `finish_time` is still `object`,
# unlike `created`, which was parsed to datetime64 above.
taskruns.dtypes

bug_id                    int64
user_id                   int64
task_id                   int64
created          datetime64[ns]
finish_time              object
user_ip                  object
link                     object
timeout                  object
project_id                int64
id                        int64
answers                  object
created_month             int64
created_day               int64
created_hour              int64
dtype: object

# Compare Order of Taskruns

The taskruns must appear in the same order whether they are sorted by `finish_time` or by `bug_id`.

In [8]:
# Order the taskruns by finish time and compare the resulting bug_id sequence,
# position by position, with the bug_id-ordered frame.
# NOTE(review): `finish_time` has dtype object, so this is a lexicographic
# sort; it matches chronological order only if the values are ISO-8601-like
# strings -- confirm.
taskruns_ordered_by_finish_time = taskruns.sort_values(by='finish_time')
taskruns_ordered_by_bug_id = taskruns.copy()  ## already ordered by bug_id

# Drop the original index labels before subtracting: pandas aligns Series on
# their index, so subtracting the raw columns would compare every row with
# itself and always yield zeros, whatever the actual ordering is.
diffs = (
    taskruns_ordered_by_finish_time['bug_id'].reset_index(drop=True)
    - taskruns_ordered_by_bug_id['bug_id'].reset_index(drop=True)
)

# Count mismatching positions instead of summing the differences: the signed
# differences between two permutations of the same values always sum to zero.
d = int((diffs != 0).sum())

assert d == 0, f"{d} taskruns are in a different position when ordered by finish_time"