### Imports

In [None]:
import os
import re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from google.colab import drive, files

In [None]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# point to the drive directory
endo_dir = '/content/drive/MyDrive/endometriosis/'

In [None]:
# load file with personas predictions
personas_file = os.path.join(endo_dir, 'output', 'predictions', 'PERSONAS', 'personas_predictions.pkl')
personas = pd.read_pickle(personas_file)

# group by where the paragraph is from
personas['og_id'] = ['_'.join(x.split('_')[:3]) for x in personas.id]

#select only those in from posts
personas_posts = personas.loc[personas['type'] == 'post']

#group by post id
personas_posts = personas_posts.groupby(['og_id'], sort=False).sum().reset_index().rename(columns = {"og_id":"id"}).drop(columns=['created_utc'])
print(len(personas_posts))
personas_posts[:1]

34522


Unnamed: 0,id,predictions_DOCTORS,predictions_FAMILY,predictions_ENDO SUPPORT COMMUNITY,predictions_PARTNER
0,Endo_otb0m_post,0,0,5,0


In [None]:
#Load topic modeling file
topic_model_file = os.path.join(endo_dir, 'output', 'topic-modeling', 'parags', 'endo+endometriosis-25_10.pkl')
tomo_df = pd.read_pickle(topic_model_file).reset_index().rename(columns = {"index":"id"}) # change index into column

# add column with info about where the paragraph comes from, whether post or comment
tomo_df['og_id'] = ['_'.join(x.split('_')[:3]) for x in tomo_df.id]
tomo_df['type'] = [x.split('_')[2] for x in tomo_df.id]

# select only those from posts
posts_tomo_df = tomo_df.loc[tomo_df['type'] == 'post']

# group paragraphs' distributions by the post/comment the paragraph comes from, taking the average of the distributions
posts_tomo_df = posts_tomo_df.groupby(['og_id'], sort=False).mean().reset_index().rename(columns = {"og_id":"id"}).drop(columns=['dominant_topic'])

print(len(posts_tomo_df))
posts_tomo_df[:1]

34190


Unnamed: 0,id,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,...,Topic 15,Topic 16,Topic 17,Topic 18,Topic 19,Topic 20,Topic 21,Topic 22,Topic 23,Topic 24
0,Endo_otb0m_post,0.002148,0.004427,0.084569,0.002483,0.004032,0.005094,0.198672,0.051963,0.00399,...,0.002933,0.457867,0.003799,0.00451,0.002562,0.034702,0.005111,0.002024,0.003314,0.005033


In [None]:
intent_file = os.path.join(endo_dir, 'output', 'predictions', 'INTENT', 'intent_predictions.pkl')
intent = pd.read_pickle(intent_file)
print(len(intent))
intent[:1]

34715


Unnamed: 0,author,id,text,type,url,link_id,parent_id,subreddit,created_utc,time,predictions_SEEKING_EXPERIENCES,predictions_SEEKING_INFO,predictions_SEEKING_EMOTION,predictions_VENT,predictions_PROVIDING_EXPERIENCES
7,theonusta,Endo_otb0m_post,We've been a community of Endometriosis suppor...,post,http://www.reddit.com/r/Endo/comments/otb0m/up...,,,Endo,1327349669,2012-01-23 20:14:29,0,0,0,0,0


### Transform topic distributions to z-scores and add to the predictions

In [None]:
posts_zscores = posts_tomo_df.copy()
cols = list(posts_zscores.columns[1:26])
# get z scores for each topic column
for col in cols:
  posts_zscores[col] = stats.zscore(posts_zscores[col])
posts_zscores[:1]

Unnamed: 0,id,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,...,Topic 15,Topic 16,Topic 17,Topic 18,Topic 19,Topic 20,Topic 21,Topic 22,Topic 23,Topic 24
0,Endo_otb0m_post,-0.23168,-0.387125,0.468209,-0.252115,-0.372082,-0.5427,2.547604,-0.02203,-0.346507,...,-0.243485,3.552453,-0.38399,-0.439752,-0.217652,-0.396432,-0.541009,-0.206036,-0.329309,-0.352841


In [None]:
big_df = personas_posts.merge(posts_zscores, how = "right", left_on = "id", right_on = "id")
print('Length df after merge:', len(big_df))
big_df = intent[['id','predictions_SEEKING_EXPERIENCES','predictions_SEEKING_INFO', 'predictions_SEEKING_EMOTION', 'predictions_VENT']].merge(big_df, how = "right", left_on = "id", right_on = "id")
print('Length df after merge:', len(big_df))
big_df[:1]

Length df after merge: 34190
Length df after merge: 34190


Unnamed: 0,id,predictions_SEEKING_EXPERIENCES,predictions_SEEKING_INFO,predictions_SEEKING_EMOTION,predictions_VENT,predictions_DOCTORS,predictions_FAMILY,predictions_ENDO SUPPORT COMMUNITY,predictions_PARTNER,Topic 0,...,Topic 15,Topic 16,Topic 17,Topic 18,Topic 19,Topic 20,Topic 21,Topic 22,Topic 23,Topic 24
0,Endo_otb0m_post,0,0,0,0,0,0,5,0,-0.23168,...,-0.243485,3.552453,-0.38399,-0.439752,-0.217652,-0.396432,-0.541009,-0.206036,-0.329309,-0.352841


## Permutation test 1

In [None]:
def permute(input_array):
  # shuffle is inplace, so copy to preserve input
  permuted = input_array.copy().values  # convert to numpy array, avoiding warning
  np.random.shuffle(permuted)
  return permuted  # convert back to pandas

def permutation_test(ddf, topic, persona, intent):
  col_1 = f'predictions_{persona}' # persona column
  col_2 = f'predictions_{intent}' # intent columns
  obs_cond = (ddf[col_1] > 0) & (ddf[col_2] == 1) # select posts where there is at least 1 mention of the persona and with the intent
  # Difference between the mean of the values in the first half and the mean of the values in the second half of the corpus
  obs_mean = ddf.loc[obs_cond][topic].mean() # take the mean topic value in selected posts
  print(f'Observed average probability: {obs_mean}')

  # Performing 10,000 permutations
  for col in [col_1, col_2]: # we select the persona or intent column
    print(f'Shuffling: {col}' )
    n_permutations = 10000 
    flag = 0
    for i in range(n_permutations): # for each of the 10,000 times
      copy = ddf.copy()  # we copy the original dataframe with the observed data
      copy[col] = permute(copy[col]) # we shuffle the selected column (persona or intent)
      perm_cond = (copy[col_1] > 0) & (copy[col_2] == 1)
      perm_mean = copy.loc[perm_cond][topic].mean()
      if perm_mean >= obs_mean:  # we test if the observed topic probability is lower
          flag += 1  # we keep count of the number of times the observed difference is larger
          # perm.append(perm_mean)
    p = flag/n_permutations
    print(f'Number of times average is larger than permutated: {flag}')
    print(f'P-value: {p}\n')

  #return perm

### Empathy

In [None]:
permutation_test(big_df, 'Topic 16', 'PARTNER','SEEKING_EMOTION')

Observed average probability: 0.9121054410934448
Shuffling: predictions_PARTNER
Number of times average is larger than permutated: 0
P-value: 0.0

Shuffling: predictions_SEEKING_EMOTION
Number of times average is larger than permutated: 0
P-value: 0.0



In [None]:
permutation_test(big_df, 'Topic 16', 'FAMILY','SEEKING_EMOTION')

Observed average probability: 0.6173312067985535
Shuffling: predictions_FAMILY
Number of times average is larger than permutated: 35
P-value: 0.0035

Shuffling: predictions_SEEKING_EMOTION
Number of times average is larger than permutated: 0
P-value: 0.0



In [None]:
permutation_test(big_df, 'Topic 16', 'ENDO SUPPORT COMMUNITY','SEEKING_EMOTION')

Observed average probability: 0.740861713886261
Shuffling: predictions_ENDO SUPPORT COMMUNITY
Number of times average is larger than permutated: 0
P-value: 0.0

Shuffling: predictions_SEEKING_EMOTION
Number of times average is larger than permutated: 0
P-value: 0.0



In [None]:
permutation_test(big_df, 'Topic 16', 'DOCTORS','SEEKING_EMOTION')

Observed average probability: 0.30777016282081604
Shuffling: predictions_DOCTORS
Number of times average is larger than permutated: 10000
P-value: 1.0

Shuffling: predictions_SEEKING_EMOTION
Number of times average is larger than permutated: 0
P-value: 0.0



### Fertility

In [None]:
permutation_test(big_df, 'Topic 12', 'PARTNER','SEEKING_INFO')

Observed average probability: 0.6289975643157959
Shuffling: predictions_PARTNER
Number of times average is larger than permutated: 0
P-value: 0.0

Shuffling: predictions_SEEKING_INFO
Number of times average is larger than permutated: 11
P-value: 0.0011



In [None]:
permutation_test(big_df, 'Topic 12', 'FAMILY','SEEKING_INFO')

Observed average probability: 0.5246304273605347
Shuffling: predictions_FAMILY
Number of times average is larger than permutated: 0
P-value: 0.0

Shuffling: predictions_SEEKING_INFO
Number of times average is larger than permutated: 8876
P-value: 0.8876



In [None]:
permutation_test(big_df, 'Topic 12', 'PARTNER','SEEKING_EXPERIENCES')

Observed average probability: 0.5774689316749573
Shuffling: predictions_PARTNER
Number of times average is larger than permutated: 0
P-value: 0.0

Shuffling: predictions_SEEKING_EXPERIENCES
Number of times average is larger than permutated: 34
P-value: 0.0034



In [None]:
permutation_test(big_df, 'Topic 12', 'FAMILY','SEEKING_EXPERIENCES')

Observed average probability: 0.6955799460411072
Shuffling: predictions_FAMILY
Number of times average is larger than permutated: 0
P-value: 0.0

Shuffling: predictions_SEEKING_EXPERIENCES
Number of times average is larger than permutated: 0
P-value: 0.0



In [None]:
permutation_test(big_df, 'Topic 12', 'PARTNER','SEEKING_EMOTION')

Observed average probability: 0.5940772294998169
Shuffling: predictions_PARTNER
Number of times average is larger than permutated: 0
P-value: 0.0

Shuffling: predictions_SEEKING_EMOTION
Number of times average is larger than permutated: 652
P-value: 0.0652



In [None]:
permutation_test(big_df, 'Topic 12', 'FAMILY','SEEKING_EMOTION')

Observed average probability: 0.6920129656791687
Shuffling: predictions_FAMILY
Number of times average is larger than permutated: 0
P-value: 0.0

Shuffling: predictions_SEEKING_EMOTION
Number of times average is larger than permutated: 52
P-value: 0.0052



In [None]:
permutation_test(big_df, 'Topic 12', 'PARTNER','VENT')

Observed average probability: 0.3288821876049042
Shuffling: predictions_PARTNER
Number of times average is larger than permutated: 0
P-value: 0.0

Shuffling: predictions_VENT
Number of times average is larger than permutated: 9963
P-value: 0.9963



In [None]:
permutation_test(big_df, 'Topic 12', 'FAMILY','VENT')

Observed average probability: 0.4007507562637329
Shuffling: predictions_FAMILY
Number of times average is larger than permutated: 0
P-value: 0.0

Shuffling: predictions_VENT
Number of times average is larger than permutated: 9969
P-value: 0.9969



### Dismissal and Abuse

In [None]:
permutation_test(big_df, 'Topic 7', 'DOCTORS','SEEKING_EMOTION')

Observed average probability: 0.5280482172966003
Shuffling: predictions_DOCTORS
Number of times average is larger than permutated: 9996
P-value: 0.9996

Shuffling: predictions_SEEKING_EMOTION
Number of times average is larger than permutated: 0
P-value: 0.0



In [None]:
permutation_test(big_df, 'Topic 7', 'DOCTORS','VENT')

Observed average probability: 0.6750293374061584
Shuffling: predictions_DOCTORS
Number of times average is larger than permutated: 9471
P-value: 0.9471

Shuffling: predictions_VENT
Number of times average is larger than permutated: 0
P-value: 0.0



In [None]:
permutation_test(big_df, 'Topic 7', 'FAMILY','SEEKING_EMOTION')

Observed average probability: 0.7107420563697815
Shuffling: predictions_FAMILY
Number of times average is larger than permutated: 39
P-value: 0.0039

Shuffling: predictions_SEEKING_EMOTION
Number of times average is larger than permutated: 0
P-value: 0.0



In [None]:
permutation_test(big_df, 'Topic 7', 'FAMILY','VENT')

Observed average probability: 0.9079230427742004
Shuffling: predictions_FAMILY
Number of times average is larger than permutated: 13
P-value: 0.0013

Shuffling: predictions_VENT
Number of times average is larger than permutated: 0
P-value: 0.0



In [None]:
permutation_test(big_df, 'Topic 7', 'PARTNER','SEEKING_EMOTION')

Observed average probability: 0.5937626957893372
Shuffling: predictions_PARTNER
Number of times average is larger than permutated: 4150
P-value: 0.415

Shuffling: predictions_SEEKING_EMOTION
Number of times average is larger than permutated: 0
P-value: 0.0



In [None]:
permutation_test(big_df, 'Topic 7', 'PARTNER','VENT')

Observed average probability: 0.7551103234291077
Shuffling: predictions_PARTNER
Number of times average is larger than permutated: 2654
P-value: 0.2654

Shuffling: predictions_VENT
Number of times average is larger than permutated: 0
P-value: 0.0



In [None]:
permutation_test(big_df, 'Topic 7', 'ENDO SUPPORT COMMUNITY','SEEKING_EMOTION')

Observed average probability: 0.4644415080547333
Shuffling: predictions_ENDO SUPPORT COMMUNITY
Number of times average is larger than permutated: 10000
P-value: 1.0

Shuffling: predictions_SEEKING_EMOTION
Number of times average is larger than permutated: 0
P-value: 0.0



In [None]:
permutation_test(big_df, 'Topic 7', 'ENDO SUPPORT COMMUNITY','VENT')

Observed average probability: 0.6957864165306091
Shuffling: predictions_ENDO SUPPORT COMMUNITY
Number of times average is larger than permutated: 8279
P-value: 0.8279

Shuffling: predictions_VENT
Number of times average is larger than permutated: 0
P-value: 0.0



## Permutation test 2

In [None]:
def permute(input_array):
  # shuffle is inplace, so copy to preserve input
  permuted = input_array.copy().values  # convert to numpy array, avoiding warning
  np.random.shuffle(permuted)
  return permuted  # convert back to pandas

def permutation_test2(ddf, topic, persona, intent):
  col_1 = f'predictions_{persona}' # persona column
  col_2 = f'predictions_{intent}' # intent columns
  obs_cond = (ddf[col_1] > 0) & (ddf[col_2] == 1)
  # Difference between the mean of the values in the first half and the mean of the values in the second half of the corpus
  obs_mean = ddf.loc[obs_cond][topic].mean()
  print(f'Observed average probability: {obs_mean}')

  # Performing 10,000 permutations
  n_permutations = 10000
  flag = 0
  for i in range(n_permutations):
    copy = ddf.copy()  # we copy the original dataframe with the observed data
    copy[col_1] = permute(copy[col_1]) # we permute both the persona and intent columns
    copy[col_2] = permute(copy[col_2])
    perm_cond = (copy[col_1] > 0) & (copy[col_2] == 1)
    perm_mean = copy.loc[perm_cond][topic].mean()
    if perm_mean >= obs_mean:  # we test if the observed difference is lesser
        flag += 1  # we keep count of the number of times the observed difference is larger

  p = flag/n_permutations
  print(f'Number of times average is larger than permutated: {flag}')
  print(f'P-value: {p}\n')

### Empathy

In [None]:
#empathy seek emotional support partner
permutation_test2(big_df, 'Topic 16', 'PARTNER','SEEKING_EMOTION')

Observed average probability: 0.9121054410934448
Number of times average is larger than permutated: 0
P-value: 0.0



In [None]:
#empathy seek emotional support family
permutation_test2(big_df, 'Topic 16', 'FAMILY','SEEKING_EMOTION')

Observed average probability: 0.6173312067985535
Number of times average is larger than permutated: 0
P-value: 0.0



In [None]:
permutation_test2(big_df, 'Topic 16', 'ENDO SUPPORT COMMUNITY','SEEKING_EMOTION')

Observed average probability: 0.740861713886261
Number of times average is larger than permutated: 0
P-value: 0.0



In [None]:
permutation_test2(big_df, 'Topic 16', 'DOCTORS','SEEKING_EMOTION')

Observed average probability: 0.30777016282081604
Number of times average is larger than permutated: 0
P-value: 0.0



### Fertility

In [None]:
permutation_test2(big_df, 'Topic 12', 'PARTNER','SEEKING_INFO')

Observed average probability: 0.6289975643157959
Number of times average is larger than permutated: 0
P-value: 0.0



In [None]:
permutation_test2(big_df, 'Topic 12', 'FAMILY','SEEKING_INFO')

Observed average probability: 0.5246304273605347
Number of times average is larger than permutated: 0
P-value: 0.0



In [None]:
permutation_test2(big_df, 'Topic 12', 'PARTNER','SEEKING_EXPERIENCES')

Observed average probability: 0.5774689316749573
Number of times average is larger than permutated: 0
P-value: 0.0



In [None]:
permutation_test2(big_df, 'Topic 12', 'FAMILY','SEEKING_EXPERIENCES')

Observed average probability: 0.6955799460411072
Number of times average is larger than permutated: 0
P-value: 0.0



In [None]:
permutation_test2(big_df, 'Topic 12', 'PARTNER','SEEKING_EMOTION')

Observed average probability: 0.5940772294998169
Number of times average is larger than permutated: 0
P-value: 0.0



In [None]:
permutation_test2(big_df, 'Topic 12', 'FAMILY','SEEKING_EMOTION')

Observed average probability: 0.6920129656791687
Number of times average is larger than permutated: 0
P-value: 0.0



In [None]:
permutation_test2(big_df, 'Topic 12', 'PARTNER','VENT')

Observed average probability: 0.3288821876049042
Number of times average is larger than permutated: 0
P-value: 0.0



In [None]:
permutation_test2(big_df, 'Topic 12', 'FAMILY','VENT')

Observed average probability: 0.4007507562637329
Number of times average is larger than permutated: 0
P-value: 0.0



### Dismissal and Abuse

In [None]:
permutation_test2(big_df, 'Topic 7', 'DOCTORS','SEEKING_EMOTION')

Observed average probability: 0.5280482172966003
Number of times average is larger than permutated: 0
P-value: 0.0



In [None]:
permutation_test2(big_df, 'Topic 7', 'DOCTORS','VENT')

Observed average probability: 0.6750293374061584
Number of times average is larger than permutated: 0
P-value: 0.0



In [None]:
permutation_test2(big_df, 'Topic 7', 'FAMILY','SEEKING_EMOTION')

Observed average probability: 0.7107420563697815
Number of times average is larger than permutated: 0
P-value: 0.0



In [None]:
permutation_test2(big_df, 'Topic 7', 'FAMILY','VENT')

Observed average probability: 0.9079230427742004
Number of times average is larger than permutated: 0
P-value: 0.0



In [None]:
permutation_test2(big_df, 'Topic 7', 'PARTNER','SEEKING_EMOTION')

Observed average probability: 0.5937626957893372
Number of times average is larger than permutated: 0
P-value: 0.0



In [None]:
permutation_test2(big_df, 'Topic 7', 'PARTNER','VENT')

Observed average probability: 0.7551103234291077
Number of times average is larger than permutated: 0
P-value: 0.0



In [None]:
permutation_test2(big_df, 'Topic 7', 'ENDO SUPPORT COMMUNITY','SEEKING_EMOTION')

Observed average probability: 0.4644415080547333
Number of times average is larger than permutated: 0
P-value: 0.0



In [None]:
permutation_test2(big_df, 'Topic 7', 'ENDO SUPPORT COMMUNITY','VENT')

Observed average probability: 0.6957864165306091
Number of times average is larger than permutated: 0
P-value: 0.0

