In [None]:
import math

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import repsci
import scipy.stats as spstats

%pylab inline

# Global plot styling applied to every figure produced by this notebook.
# The settings go through `plt.rcParams`, which is the same object as
# `matplotlib.rcParams`, so this cell relies only on the explicit
# `from matplotlib import pyplot as plt` import and not on the bare
# `matplotlib` name that `%pylab inline` injects into the namespace.
plt.rcParams.update({
    'font.size': 9,
    'figure.dpi': 150,
    'lines.markersize': 9,
})

In [None]:
# Experiment handle from `repsci`; its `get_filename()` method is used further
# down to build the output paths of the saved figures.
# NOTE(review): presumably this also creates/selects an output location named
# after 'comments' -- confirm against the repsci documentation.
exp = repsci.Experiment('comments')

In [None]:
# Load the tab-separated comment table and key it by comment id.
df = pd.read_csv('../results/comments_2_3.tsv', sep='\t').set_index('comment_id')

# Distinct pod ids that appear in the comment table, in ascending order.
pods = sorted(set(df['pod']))

In [None]:
# Load the tab-separated per-pod table and key it by pod id.
# NOTE(review): this path is 'results/...' (relative to the working directory)
# while the comment table is read from '../results/...' -- confirm that both
# locations are intended.
df_pods = pd.read_csv('results/pods_2_3.tsv', sep='\t').set_index('pod_id')

In [None]:
# Display names for the treatment codes stored in df_pods.treatment.
treatments = {
    1: "Control",
    2: "Random-Pod"
}

pod_data = {}  # pod id -> formatted text transcript of that pod's comments
total = 0      # running number of comments rendered (top-level + replies)

for pod in pods:
    # Header line; `stage` is stored 0-based and shown 1-based.
    header = "Stage {} - Pod {} ({})\n\n".format(
        df_pods['stage'][pod] + 1, pod, treatments[df_pods['treatment'][pod]]
    )

    df_pod = df[df['pod'] == pod]
    # Comments without a parent start a thread.
    top_level = df_pod[df_pod['parent_id'].isna()]

    threads = []
    for parent in sorted(set(top_level.index)):
        # First entry of a thread: the top-level comment itself.
        entries = [
            "P{}: ".format(top_level['user_id'][parent])
            + top_level['body'][parent].replace('<br/>', "\n")
        ]

        # Direct replies follow, each prefixed with "|   ".
        # NOTE(review): continuation lines inside a reply get the prefix
        # "|.   " (with a dot) -- confirm that the dot is intended.
        df_children = df_pod[df_pod['parent_id'] == parent]
        child_ids = sorted(set(df_children.index))
        entries.extend(
            "|   P{}: ".format(df_children['user_id'][child])
            + df_children['body'][child].replace('<br/>', "\n|.   ")
            for child in child_ids
        )

        # One for the top-level comment plus one per reply.
        total += 1 + len(child_ids)
        threads.append("\n---\n".join(entries) + "\n===\n\n")

    pod_data[pod] = header + "".join(threads)

In [None]:
# Write one plain-text transcript per pod into the current working directory
# (comments-001.txt, comments-002.txt, ...).
# The encoding is pinned to UTF-8: a bare open() uses the platform's default
# encoding, which can fail on (or mangle) non-ASCII characters that comment
# bodies may contain.
for pod in pods:
    with open("comments-{:03d}.txt".format(pod), "w", encoding="utf-8") as f:
        f.write(pod_data[pod])

In [None]:
# Attach the per-pod columns to every comment row by matching each comment's
# `pod` value against the `pod_id` index of df_pods. The `stage` and
# `treatment` columns read from `df` in the cells below come from this join.
# NOTE: this rebinds `df`, so the cell is not idempotent -- running it a second
# time without reloading `df` attempts to join the same columns again.
df = df.join(df_pods, on='pod')

In [None]:
# Split the joined comment table by treatment code
# (1 = "Control", 2 = "Random-Pod" in the `treatments` mapping).
df_control = df.loc[df['treatment'] == 1]
df_random = df.loc[df['treatment'] == 2]

In [None]:
def activity_counts(df, n_stages=3):
    """Collect per-participant comment counts, grouped by stage.

    Parameters
    ----------
    df : pandas.DataFrame
        Comment table with ``user_id``, ``stage`` and ``body`` columns.
        Stages are looked up as the integers ``0 .. n_stages - 1``.
    n_stages : int, optional
        Number of stages to report. Defaults to 3, the value that was
        previously hard-coded.

    Returns
    -------
    list of list
        ``counts[s]`` holds, for stage ``s``, the number of non-null ``body``
        values of every participant that has at least one row in that stage,
        ordered by ``user_id``. Participants with no rows in a stage are left
        out (they are not recorded as 0); a stage without rows yields ``[]``.
    """
    # Count only the `body` column instead of every column of the frame.
    # groupby sorts its keys, so within a stage the counts are ordered by
    # user_id.
    per_user = df.groupby(['stage', 'user_id'])['body'].count()
    stages_present = set(per_user.index.get_level_values('stage'))

    counts = []
    for stage in range(n_stages):
        if stage in stages_present:
            counts.append(list(per_user.loc[stage].to_numpy()))
        else:
            # Nobody commented in this stage.
            counts.append([])
    return counts

In [None]:
# Per-stage histograms of how many comments each participant wrote, with the
# two treatments drawn side by side in every panel.
control_counts = activity_counts(df_control)
random_counts = activity_counts(df_random)

fig, axes = plt.subplots(1, 3, figsize=(6, 2))

for i, ax in enumerate(axes):
    ax.hist(
        [control_counts[i], random_counts[i]],
        # Unit-width bins centred on the integer counts 1..4; counts above 4
        # fall outside the last bin and are not drawn.
        bins=[0.5, 1.5, 2.5, 3.5, 4.5],
        label=['Control', 'Random-Pod'], zorder=3)
    ax.set_ylim([0, 30])
    ax.set_xticks(range(1, 5))
    ax.set_xlabel('Comment Ct.')
    ax.set_ylabel('Participant Ct.')
    ax.grid(axis="y")
    ax.set_title("Stage {}".format(i + 1))
    # Only the last panel carries the legend.
    if i == 2:
        ax.legend(fontsize=6)

fig.tight_layout()
fig.savefig(exp.get_filename('fig-comment-counts.eps'))
fig.savefig(exp.get_filename('fig-comment-counts.png'), dpi=600)

In [None]:
# Per-stage summary of commenting activity for both treatments:
# total number of comments (left) and entropy of the per-participant
# comment counts (right).
control_counts = activity_counts(df_control)
random_counts = activity_counts(df_random)

control_totals = [sum(per_user) for per_user in control_counts]
random_totals = [sum(per_user) for per_user in random_counts]

# scipy normalises each list of counts to a probability distribution before
# computing its entropy.
control_entropy = [spstats.entropy(per_user) for per_user in control_counts]
random_entropy = [spstats.entropy(per_user) for per_user in random_counts]

fig, (ax_totals, ax_entropy) = plt.subplots(1, 2, figsize=(6, 3))

panels = (
    (ax_totals, control_totals, random_totals, 'Total Comments'),
    (ax_entropy, control_entropy, random_entropy, 'Entropy'),
)
for panel_ax, control_series, random_series, y_label in panels:
    panel_ax.plot(control_series, '.-', label='Control')
    panel_ax.plot(random_series, '.-', label='Random-Pod')
    # Positions 0..2 on the x axis are labelled as stages 1..3.
    panel_ax.set_xticks(range(3))
    panel_ax.set_xticklabels(range(1, 4))
    panel_ax.set_xlabel('Stage')
    panel_ax.set_ylabel(y_label)
    panel_ax.grid()
    panel_ax.legend()

# Only the totals panel uses a fixed y range.
ax_totals.set_ylim([0, 55])

fig.tight_layout()

fig.savefig(exp.get_filename('fig-comment-stages.eps'))
fig.savefig(exp.get_filename('fig-comment-stages.png'), dpi=600)