In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import seaborn as sns

# Configure seaborn's plot styling for the whole notebook, using the
# "DejaVu Sans" font for text in figures.
sns.set(font="DejaVu Sans")

In [None]:
# Load the task-run export and key every row by its finish time.
df = (
    pd.read_csv("./LostAtNight_task_run.csv")
    .set_index("task_run__finish_time")
)
# The index is read as plain values; convert it to datetimes so the rows can be
# grouped and filtered by date later on.
df.index = pd.to_datetime(df.index)

# Calculation of Metrics

All metrics are taken almost directly from the paper ["How is success defined and measured in online citizen science?"][1]. Some of these metrics are somewhat arbitrary, and the reasoning behind certain decisions (for example, how project age is represented) is a bit arcane — bear that in mind when interpreting the results.

[1]: https://dx.doi.org/10.1109/MCSE.2015.65

## Public Engagement

In [None]:
def project_appeal(num_users, active_period):
    """
    Return the number of users divided by the square of the active period.

    num_users: count of users who contributed.
    active_period: length of the project's active period (the caller decides
        the unit; the result scales with its inverse square).
    """
    period_squared = active_period ** 2
    return num_users / period_squared

def public_contribution(median_cpv, active_period):
    """
    Return `median_cpv` divided by the square of the active period.

    median_cpv: median value of the per-volunteer contribution counts.
    active_period: length of the project's active period (the caller decides
        the unit; the result scales with its inverse square).
    """
    period_squared = active_period ** 2
    return median_cpv / period_squared

def sustained_engagement(median_vap, active_period):
    """
    Return `median_vap` divided by the square of the active period.

    median_vap: median volunteer active period.
    active_period: length of the project's active period (the caller decides
        the unit; the result scales with its inverse square).
    """
    period_squared = active_period ** 2
    return median_vap / period_squared

def gini(arr):
    """
    Calculate Gini coefficient of array.

    The input is flattened, converted to floats and sorted in ascending order.
    The coefficient is then computed as

        sum((2 * i - n - 1) * x_i) / (n * sum(x))

    where ``i`` is the 1-based rank of ``x_i`` and ``n`` the number of values.

    Parameters
    ----------
    arr : array-like of numbers
        Values whose inequality is measured. Any shape is accepted; the
        values are flattened first.

    Returns
    -------
    float
        The Gini coefficient. NaN is returned when the coefficient is
        undefined, i.e. when the input is empty or its values sum to zero.

    https://en.wikipedia.org/wiki/Gini_coefficient
    http://neuroplausible.com/gini
    http://www.statsdirect.com/help/default.htm#nonparametric_methods/gini.htm
    """
    # Work in floating point so the rank-weighted products below cannot
    # overflow a narrow integer dtype.
    values = np.sort(np.ravel(np.asarray(arr, dtype=float)))
    count = values.size
    total = values.sum()

    # Empty or zero-sum input would otherwise divide by zero and emit a
    # RuntimeWarning; report the undefined result explicitly instead.
    if count == 0 or total == 0:
        return np.float64(np.nan)

    ranks = np.arange(1, count + 1)
    return np.sum((2 * ranks - count - 1) * values) / (count * total)

def distribution_of_effort(cpu):
    """
    Return one minus the Gini coefficient of `cpu`.

    cpu: array-like of per-user contribution counts, passed to `gini` as is.
    """
    inequality = gini(cpu)
    return 1 - inequality

In [None]:
# Store metrics here: one list per metric, with one entry appended per year.
metrics = {
    "years": [],
    "appeal": [],
    "contribution": [],
    "engagement": []
}

# Group data annually (bins start at the beginning of each year and are closed
# on the left), keyed on the datetime index of `df`.
# pd.Grouper(freq=...) replaces pd.TimeGrouper, which was deprecated and later
# removed from pandas.
years = df.groupby(by=pd.Grouper(freq="AS", closed="left"))

# Go through each year's data
user_field = "task_run__user_id"
time_field = "task_run__created"
# Earliest finish time in the data set; taking the minimum of the index avoids
# sorting the whole frame just to read its first entry.
start_date = df.index.min()
# Compute the public-engagement metrics once per calendar year.
# Only the year of each group key is used; the rows themselves are re-selected
# from `df` inside the loop.
for timestamp in sorted(years.groups.keys()):
    year = timestamp.year
    annual = df[df.index.year == year]
    
    # Calculate classifications per user (number of rows per user id this year)
    cpu = annual.groupby(by=[user_field])[user_field].count()
    
    # Project's active period: time from the overall start date to the last
    # row of this year, in weeks. `.days` keeps whole days only.
    # NOTE(review): if a year has no rows, `index[-1]` raises IndexError; if the
    # span is shorter than one day, active_weeks is 0 and the metric functions
    # divide by zero - confirm neither can happen with this data.
    delta = annual.sort_index().index[-1] - start_date
    active_weeks = delta.days / 7.0
    
    # Number of users (distinct user ids with rows in this year)
    num_users = annual.groupby(user_field).count().shape[0]
    
    # Median classifications per user
    median_cpv = cpu.median()
    
    # Median volunteer active period
    # First `time_field` value per user over the WHOLE data set. This does not
    # depend on `year`, so it is recomputed identically on every iteration.
    # NOTE(review): first()/last() follow row order, not time order - assumes
    # the rows are already chronological per user; TODO confirm.
    fst = pd.to_datetime(df.groupby(by=[user_field]).first()[time_field])
    fst = fst.reset_index().set_index(user_field)

    # Last `time_field` value per user within this year only.
    lst = pd.to_datetime(annual.groupby(by=[user_field]).last()[time_field])
    lst = lst.reset_index().set_index(user_field)

    # Align on user id; the overall-first column gets the "_fst" suffix.
    fst_lst = lst.join(fst, how="left", rsuffix="_fst")

    # `delta` is reused here: it now holds the per-user time between first and
    # last contribution, no longer the project-level span computed above.
    delta = fst_lst[time_field] - fst_lst[time_field + "_fst"]
    # 604800 = 7 * 24 * 60 * 60 seconds, i.e. the median is expressed in weeks.
    median_vap = delta.median(axis=0).total_seconds() / 604800.0
    
    # Distribution of effort
    # NOTE(review): dist_effort is not stored in `metrics` in the lines below.
    dist_effort = distribution_of_effort(cpu)
    
    # Calculate metrics and record them for this year
    metrics["years"].append(year)
    metrics["appeal"].append(project_appeal(num_users, active_weeks))
    metrics["contribution"].append(public_contribution(median_cpv, active_weeks))
    metrics["engagement"].append(sustained_engagement(median_vap, active_weeks))