In [8]:
import numpy as np

Use a gamma/Poisson Bayesian model since gamma is a conjugate prior for Poisson

In [6]:
def discussion_score(threads, alpha=1, beta=2):
    """Score a comments section by the expected length of its next thread.

    Thread lengths are modelled as Poisson draws whose rate (lambda) has a
    gamma prior. Since the gamma distribution is a conjugate prior for the
    Poisson, the posterior mean of lambda (which is also the expected value
    of the Poisson itself) is available analytically:

        (alpha + sum(lengths)) / (beta + n)

    This is the same quantity as the weighted average
    ``n/(n+beta) * mean(lengths) + beta/(n+beta) * (alpha/beta)`` of the
    sample mean and the prior mean ``alpha / beta``.

    Args:
        threads: iterable of objects exposing a numeric ``length`` attribute.
        alpha: shape hyperparameter of the gamma prior.
        beta: rate hyperparameter of the gamma prior; it weighs the prior
            like ``beta`` extra observations, so a lower beta lets the data
            dominate sooner. Should be positive.

    Returns:
        The posterior mean of lambda. With no threads at all this is the
        prior mean ``alpha / beta``.
    """
    lengths = np.array([t.length for t in threads])
    n = len(lengths)

    # Use the closed form rather than the weighted-average form: it never
    # divides by n, so an empty comments section yields the prior mean
    # instead of nan (0/0).
    return (alpha + np.sum(lengths)) / (beta + n)


In [81]:
class Thread:
    """Dummy stand-in for a comment thread; only its length is recorded."""

    def __init__(self, length):
        self.length = length

In [67]:
# small comments section, with mostly single comments, but one longer thread
threads = [Thread(length) for length in (10, 1, 1, 1)]
score_1 = discussion_score(threads)
print('score 1:', score_1)

# a longer comments section, with mostly single comments, but one longer thread
threads = [Thread(length) for length in (10, 1, 1, 1, 1, 1, 1)]
score_2 = discussion_score(threads)
print('score 2:', score_2)

# a larger sample size with more short threads should lower the score
assert score_2 < score_1

# more longer threads should have a higher score
threads = [Thread(length) for length in (10, 10, 10, 1)]
score_3 = discussion_score(threads)
print('score 3:', score_3)
assert score_3 > score_1

# only longer threads should have an even higher score
threads = [Thread(10) for _ in range(4)]
score_4 = discussion_score(threads)
print('score 4:', score_4)
assert score_4 > score_3

# more threads should make us more confident in the score
threads = [Thread(10) for _ in range(7)]
score_5 = discussion_score(threads)
print('score 5:', score_5)
assert score_5 > score_4

# more threads should make us more confident in the score
# lower beta should give higher score (see below for more info)
threads = [Thread(10) for _ in range(10)]
score_6 = discussion_score(threads, beta=1)
print('score 6:', score_6)
assert score_6 > score_5

score 1: 2.33333333333
score 2: 1.88888888889
score 3: 5.33333333333
score 4: 6.83333333333
score 5: 7.88888888889
score 6: 9.18181818182


In [11]:
import math

class Thread():
    """Dummy class to represent threads.

    This redefinition replaces the earlier length-only ``Thread``.
    ``participants`` defaults to 1 (every thread has at least its author) so
    that one-argument calls such as ``Thread(10)`` from earlier cells still
    work if those cells are re-run after this one.

    Args:
        length: number of comments in the thread.
        participants: number of unique participants in the thread.
    """

    def __init__(self, length, participants=1):
        self.length = length
        self.participants = participants

# need to also take into account number of participants - a better discussion is one with more people involved
def participation_weighted_score(threads):
    """Scale the discussion score by how many distinct people took part.

    The weight is the square root of the mean participants-per-comment ratio
    across the threads, multiplied by ``discussion_score(threads)``.
    """
    mean_participant_ratio = np.mean([t.participants / t.length for t in threads])
    return math.sqrt(mean_participant_ratio) * discussion_score(threads)


threads = [Thread(10, 2), Thread(10, 10)]
score_1 = participation_weighted_score(threads)
print('score 1', score_1)

threads = [Thread(10, 10), Thread(10, 10)]
score_2 = participation_weighted_score(threads)
print('score 2', score_2)

# threads of same length, but with more participants, should have a higher score
assert score_2 > score_1

score 1 4.06663251352
score 2 5.25


We use the gamma distribution to give us a value for $\lambda$, the parameter (and mean, i.e. expected value) of the Poisson distribution that we assume describes thread lengths in a comments section. Specifically, we take the expected value of the gamma distribution as $\lambda$.

In this context, $\lambda$ is a prediction for thread length given a comments section. That is, it tells us: if a new thread is started in this comments section, how long do we expect it to be? We expect it to be longer if an article is generating more discussion.

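The expected value for a gamma distribution is just $\alpha \beta$ when $\alpha$ is the shape and $\beta$ is the scale (the parameterization `np.random.gamma` uses). Note that `discussion_score` above instead treats $\beta$ as a rate (inverse scale): its prior term is $\alpha / \beta$, so there a *lower* $\beta$ means a higher prior mean and a weaker prior.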

So if we set $\alpha = 1$ (shape), $\beta = 2$ (scale), then the prior value for $\lambda = \alpha \beta = 2$, which means that by default, in an empty comments section, we expect the first thread to have about two comments in it on average. We can scale this back and more conservatively estimate that the first thread will have only one comment by setting $\alpha = 1, \beta = 1$, so that the prior $\lambda = 1$.

In [74]:
# just to computationally demonstrate that alpha * beta is indeed the expected value for the gamma distribution
# (np.random's gamma takes a shape and a scale, in that order)

alpha = 1
beta = 2
n_samples = 100000

# expected gamma value is alpha * beta
expected_1 = alpha * beta

# can also compute the expected value by simulation
# (seeded local generator so a re-run reproduces the same number)
rng = np.random.RandomState(0)
samples = rng.gamma(alpha, beta, size=n_samples)
expected_2 = np.mean(samples)

print(expected_1)
print(expected_2)

# these two should be very close: the sample mean has standard error sd / sqrt(n),
# with sd = sqrt(alpha) * beta for the gamma. A fixed 0.01 bound is only ~1.6
# standard errors here and fails by chance fairly often, so allow 5 of them.
tolerance = 5 * (alpha ** 0.5 * beta) / n_samples ** 0.5
assert abs(expected_1 - expected_2) < tolerance

2
2.00539740578


In [80]:
# doing the same for poisson
lmbda = 4
n_samples = 100000

expected_1 = lmbda
# seeded local generator so a re-run reproduces the same number
rng = np.random.RandomState(0)
samples = rng.poisson(lmbda, size=n_samples)
expected_2 = np.mean(samples)

print(expected_1)
print(expected_2)

# these two should be very close: the sample mean has standard error sd / sqrt(n),
# with sd = sqrt(lambda) for the poisson. A fixed 0.01 bound is only ~1.6
# standard errors here and fails by chance fairly often, so allow 5 of them.
tolerance = 5 * lmbda ** 0.5 / n_samples ** 0.5
assert abs(expected_1 - expected_2) < tolerance

4
4.0004


Trying it with cython~

In [2]:
%load_ext Cython

In [3]:
%%cython

import numpy as np
cimport numpy as np

# Cython variant of the gamma/Poisson posterior-mean score, using the same
# formula as py_discussion_score so the two can be timed against each other.
# Unlike discussion_score, `threads` here is the array of thread lengths itself
# (it is passed straight to np.sum), not a list of Thread objects.
def cy_discussion_score(np.ndarray threads, float alpha=1, float beta=2):
    # number of observed threads, held as a C int
    # NOTE(review): n == 0 is not guarded; the division by n below presumably
    # fails or yields nan for an empty array - confirm.
    cdef int n = len(threads)
    # weighted average of the sample mean (weight n/(n+beta)) and the prior
    # mean alpha/beta (weight beta/(n+beta))
    return ((n/(n + beta)) * (np.sum(threads)/n)) + (beta/(n+beta) * (alpha/beta))

In [4]:
def py_discussion_score(threads, alpha=1, beta=2):
    """Pure-Python baseline for the Cython timing comparison.

    ``threads`` is an array of thread lengths. Returns the gamma/Poisson
    posterior mean: the sample mean and the prior mean ``alpha / beta``
    blended with weights ``n/(n+beta)`` and ``beta/(n+beta)``.
    """
    count = len(threads)
    data_weight = count / (count + beta)
    sample_mean = np.sum(threads) / count
    prior_weight = beta / (count + beta)
    prior_mean = alpha / beta
    return (data_weight * sample_mean) + (prior_weight * prior_mean)

In [5]:
%timeit cy_discussion_score(np.array([1,2,4,4,4,4,4,4,4]), 1, 2)
%timeit py_discussion_score(np.array([1,2,4,4,4,4,4,4,4]), 1, 2)

The slowest run took 14.94 times longer than the fastest. This could mean that an intermediate result is being cached 
100000 loops, best of 3: 7.3 µs per loop
The slowest run took 5.33 times longer than the fastest. This could mean that an intermediate result is being cached 
100000 loops, best of 3: 7.65 µs per loop


---

Maybe rather than thread length, it is better to try and predict the number of unique participants a thread will have. Can use the same function as above, just assume that `Thread.length` is the number of unique participants instead.

Could also use the original thread length score but combine it with "what is the probability that the next speaker in a thread is a new participant?"

In [28]:
def differing_dyads(thread):
    """Count speaker changes between consecutive comments in a thread.

    ``thread`` is a sequence of participant ids, one per comment. Returns a
    tuple ``(n_diff_participant, n_pairs)``: how many adjacent pairs (dyads)
    involve two different participants, and how many adjacent pairs exist.
    """
    dyads = list(zip(thread, thread[1:]))
    speaker_changes = sum(1 for earlier, later in dyads if earlier != later)
    return speaker_changes, len(dyads)

# Threads given as participant ids: a back-and-forth exchange, then a monologue.
for thread in ([0,1,0,1,2,1,1,1], [0,0,0,0,0,0,0,0]):
    diff, total = differing_dyads(thread)
    print(diff/total)

0.7142857142857143
0.0


In [29]:
# TO DO build a model around the above