In [26]:
import csv
import pandas as pd
import numpy as np
from scipy.stats import binom, norm
import statistics as st

In [32]:
def sign_test(x, y = None, md = 0, alternative = "two_sided"):

  # One sample sign test
  if y is None:
    
    x_minus_md = [(i - md) for i in x]
    x_md_clean = [j for j in x_minus_md if j != 0]  # remove the zeroes
    n_adj = len(x_md_clean)  # sample size after removing the zeroes
    pos_count = len([k for k in x_md_clean if k > 0])

    med_count = st.median(np.arange(0, n_adj + 1))
    
    if (alternative == "two_sided") and (pos_count >= med_count):
      p_val = 2 * (1 - binom.cdf((pos_count - 1), n_adj, 0.5))
    elif pos_count < med_count:
      p_val = 2 * binom.cdf(pos_count, n_adj, 0.5)
  
    if alternative == "greater":
      p_val = 1 - binom.cdf(pos_count, n_adj, 0.5)
    elif alternative == "less":
      p_val = binom.cdf(pos_count, n_adj, 0.5)
      
  # Paired sign test
  elif y is not None:
    
    xx = np.array(x)
    yy = np.array(y)
    diff = xx - yy
    diff_list = diff.tolist()
    diff_clean = [j for j in diff_list if j != 0]
    n_adj = len(diff_clean)
    pos_count = len([k for k in diff_clean if k > 0])

    med_count = st.median(np.arange(0, n_adj + 1))

    if (alternative == "two_sided") and (pos_count >= med_count):
      p_val = 2 * (1 - binom.cdf((pos_count - 1), n_adj, 0.5))
    elif pos_count < med_count:
      p_val = 2 * binom.cdf(pos_count, n_adj, 0.5)

    if alternative == "greater":
      p_val = 1 - binom.cdf(pos_count, n_adj, 0.5)
    elif alternative == "less":
      p_val = binom.cdf(pos_count, n_adj, 0.5)

  print(med_count) 
  print(f"Number of positives: {pos_count} out of a sample size of {n_adj}")
  return(print(f"P_value: {p_val}"))


In [4]:
url = "/content/drive/MyDrive/Resources/Pine_stand.csv"
df = pd.read_csv(url)

In [5]:
# Test data for one sample sign test
lob_dbh = df[df["Species"] == "Loblolly pine"].dbh.to_list()
slash_dbh = df[df["Species"] == "Slash pine"].dbh.to_list()


In [6]:
# Actual medians for testing
print(df[df["Species"] == "Loblolly pine"].dbh.median())
print(df[df["Species"] == "Slash pine"].dbh.median())


45.0
34.0


In [33]:
# Test data for paired sign test
test1 = norm.rvs(size = 10, loc = 5, scale = 1)
test2 = norm.rvs(size = 10, loc = 5, scale = 1)
test3 = norm.rvs(size = 10, loc = 7, scale = 1)

In [34]:
sign_test(lob_dbh, md = 45)

40.5
Number of positives: 39 out of a sample size of 81
P_value: 0.8243132151052075


In [35]:
sign_test(lob_dbh, md = 30)

41.5
Number of positives: 72 out of a sample size of 83
P_value: 3.9181990985071025e-12


In [31]:
2* (1 - binom.cdf(72, 83, 0.5))

5.799805080641818e-13

In [20]:
sign_test(x = test1, y = test2)

trigger1
5
Number of positives: 6 out of a sample size of 10
P_value: 0.34375


In [21]:
sign_test(x = test1, y = test2, alternative = "greater")

5
Number of positives: 6 out of a sample size of 10
P_value: 0.171875


In [22]:
sign_test(x = test1, y = test2, alternative = "less")

5
Number of positives: 6 out of a sample size of 10
P_value: 0.828125


In [25]:
# note potentially low power in this case
sign_test(x = test1, y = test3)

trigger2
5
Number of positives: 1 out of a sample size of 10
P_value: 0.021484374999999997
