In [35]:
import pandas as pd
import numpy as np

import scipy.stats as st

from sklearn.preprocessing import OrdinalEncoder

In [36]:
# Load the cleaned CSV into a DataFrame and preview the first few rows.
df = pd.read_csv('randomness_12k_clean.csv')
df.head()

Unnamed: 0,subreddit,response_code
0,changemyview,302
1,Terraform,302
2,lostpause,302
3,USPS,302
4,MaliciousCompliance,302


In [37]:
# Summarize the columns: dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12055 entries, 0 to 12054
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   subreddit      12055 non-null  object
 1   response_code  12055 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 188.5+ KB


In [38]:
# Pull the subreddit names out as a single-column 2-D array (one row per
# sample), which is the layout passed to the encoder in the next cell.
subreddit_names = df['subreddit'].to_numpy()
subs = subreddit_names[:, np.newaxis]
subs

array([['changemyview'],
       ['Terraform'],
       ['lostpause'],
       ...,
       ['HalfLife'],
       ['watch_dogs'],
       ['longboarding']], dtype=object)

In [39]:
# Map each subreddit name to a numeric code, then flatten the (n, 1) result
# back to a 1-D array so the sequence can be compared against its median.
subs_enc = OrdinalEncoder().fit_transform(subs).ravel()
subs_enc

array([2185., 1668., 2840., ...,  820., 3602., 2832.])

In [40]:
# Wald-Wolfowitz runs test for randomness -- step 1: dichotomize the data.
#
# The method follows the "Applications" section of the Wikipedia article
# https://en.wikipedia.org/wiki/Wald%E2%80%93Wolfowitz_runs_test
#
# and the paper
# "An Application of the Runs Test to Test for Randomness of Observations
#  Obtained from a Clinical Survey in an Ordered Population"
# by Mohamad Adam Bujang and Fatin Ellisya Sapri
# Published online 2018 Aug 30. doi: 10.21315/mjms2018.25.4.15
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6422539/

subs_median = np.median(subs_enc)

# Label every encoded value 1 if it lies above the median and 0 if it lies
# below. Values equal to the median satisfy neither comparison, so they are
# thrown out of the sequence.
subs_runs = [
  int(x > subs_median)
  for x in subs_enc
  if x > subs_median or x < subs_median
]


In [41]:
# Fraction of 1s in the dichotomized sequence (0.49995851655189577 here).
# That is pretty much equal probability for 0 and 1, which I think holds
# by definition since the split point was the median.
runs_mean = np.asarray(subs_runs).mean()
runs_mean

0.49995851655189577

In [42]:
# Count the number of runs, i.e. maximal stretches of identical consecutive
# labels, in the dichotomized sequence.
#
# Every position whose label differs from its predecessor starts a new run,
# so the number of runs is the number of such changes plus one for the first
# run. An empty sequence contains no runs at all, so it is reported as 0
# instead of picking up the unconditional "+ 1".
if len(subs_runs) == 0:
  total_runs = 0
else:
  label_changes = sum(
    prev != cur for prev, cur in zip(subs_runs, subs_runs[1:])
  )
  total_runs = label_changes + 1
print('number of runs:', total_runs)

number of runs: 6043


In [43]:
# Mean and variance of the total number of runs under the null hypothesis,
# given the counts of 1s and 0s observed in the dichotomized sequence.
pos_N = np.asarray(subs_runs).sum()  # how many 1s
neg_N = len(subs_runs) - pos_N       # how many 0s
N = pos_N + neg_N
print(pos_N, neg_N, N)

# mean = 2 * pos_N * neg_N / N + 1
H0_mean = (2 * pos_N * neg_N) / N + 1
print('H_0 mean:    ', H0_mean)

# variance = (mean - 1) * (mean - 2) / (N - 1)
H0_var = ((H0_mean - 1) * (H0_mean - 2)) / (N - 1)
print('H_0 variance:', H0_var)


6026 6027 12053
H_0 mean:     6027.4999585165515
H_0 variance: 3012.9999377731065


In [44]:
# Express the gap between the observed and the expected number of runs in
# units of the null distribution's standard deviation.
diff_mean = np.abs(total_runs - H0_mean)  # unsigned: the direction is dropped
H0_sigma = np.sqrt(H0_var)
test_sigma = diff_mean / H0_sigma
print('Difference from theoretical mean is', diff_mean)
print('which is a', test_sigma, 'sigma event')


Difference from theoretical mean is 15.500041483448513
which is a 0.2823795872836179 sigma event


In [45]:
# Compare the test statistic with the two-tailed critical value of the
# standard normal distribution.

# NOTE: despite its name, `p_value` is the chosen significance level
# (alpha), not a p-value computed from the data.
p_value = 0.05
# Two-tailed test: half of the significance level goes in each tail.
crit = 1 - (p_value / 2)
crit_sigma = st.norm.ppf(crit)

print('Critical value of sigma (two-tailed):', crit_sigma)
print('Test value of sigma:                 ', test_sigma)
print()

# Both outcomes are reported in the same format: a full sentence followed by
# a blank line before the statement of the null hypothesis.
if abs(test_sigma) > crit_sigma:
  print(f'Reject the null hypothesis at the p = {p_value} level.\n')
else:
  print(f'Fail to reject the null hypothesis at the p = {p_value} level.\n')

print('Null Hypothesis: The total number of runs in the data comes from the distribution')
print('of total numbers of runs that we would expect to see in a random sequence.')


Critical value of sigma (two-tailed): 1.959963984540054
Test value of sigma:                  0.2823795872836179

Fail to reject the null hypothesis at the p = 0.05 level.

Null Hypothesis: The total number of runs in the data comes from the distribution
of total numbers of runs that we would expect to see in a random sequence.


In [46]:
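# Failed to reject the null hypothesis, so the runs test gives no evidence
# against randomness. I'm going to treat the sequence as random
# (failing to reject H0 is consistent with randomness but does not prove it).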