In [None]:
# Colab setup: install networkx, which is used further down to build the reply graph
!pip install networkx --quiet

In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from google.colab import drive, files
import networkx as nx

In [None]:
# Mount Google Drive at /content/drive so the project files are reachable from this runtime
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Project root on the mounted Drive; every input and output path below is built from it.
# Change this to point to your own project directory.
endo_dir = '/content/drive/MyDrive/endometriosis/'

In [None]:
# Prefix used in Reddit-style ids for each kind of row: 't3' for posts, 't1' for comments
types = dict(post='t3', comment='t1')

In [None]:
# Load the pickled dataset and discard the columns this notebook does not use.
# NOTE(review): unpickling can execute arbitrary code - only load pickles from a trusted source.
dataset_file = os.path.join(endo_dir, 'data', 'endo+endometriosis.pkl')
unused_columns = ['text', 'flair', 'created_utc', 'url', 'subreddit', 'time']
data_df = pd.read_pickle(dataset_file)
data_df = data_df.drop(columns=unused_columns)
print(len(data_df))

392087


In [None]:
# Rewrite our ids ('<subreddit>_<alphanumeric>_<type>') as '<type prefix>_<alphanumeric>',
# the format used by link_id / parent_id, so posts and comments can be connected.
def _to_reddit_id(our_id):
  """Return the Reddit-style id for one of our '<sub>_<alphanumeric>_<type>' ids."""
  _sub, alphanumeric, typ = our_id.split('_')
  return f'{types[typ]}_{alphanumeric}'

reddit_ids = [_to_reddit_id(our_id) for our_id in data_df.id]

data_df['reddit_id'] = reddit_ids
data_df.head()

Unnamed: 0,author,id,type,link_id,parent_id,reddit_id
0,endogirl,Endo_c3efp47_comment,comment,t3_mrkjh,t1_c39qhd9,t1_c3efp47
1,endogirl,Endo_c3empyk_comment,comment,t3_o5y3y,t3_o5y3y,t1_c3empyk
2,oneautumnday,Endo_c3epqgz_comment,comment,t3_o5y3y,t3_o5y3y,t1_c3epqgz
3,applegoodstomach,Endo_c3epskq_comment,comment,t3_mrkjh,t1_c3efp47,t1_c3epskq
4,[deleted],Endo_c3f4nn3_comment,comment,t3_o5y3y,t3_o5y3y,t1_c3f4nn3


In [None]:
# Load the topic-modeling output. The ids live in the index, so move them into an 'id' column
# and drop the columns that are not needed here.
topic_model_file = os.path.join(endo_dir, 'output', 'topic-modeling', 'parags', 'endo+endometriosis-25_10.pkl')
tomo_df = (
    pd.read_pickle(topic_model_file)
    .reset_index()
    .rename(columns={"index": "id"})
    .drop(columns=['og_doc', 'dominant_topic'])
)

In [None]:
# Replace every topic column by its z-score (computed over all rows of tomo_df).
zscores = tomo_df.copy()
# Select the topic columns by name instead of by position (columns[1:26]) so the cell keeps
# working when the number of topics or the column order changes.
cols = [col for col in zscores.columns if str(col).startswith('Topic')]
for col in cols:
  zscores[col] = stats.zscore(zscores[col])

tomo_df = zscores
tomo_df[:1]

Unnamed: 0,id,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,...,Topic 15,Topic 16,Topic 17,Topic 18,Topic 19,Topic 20,Topic 21,Topic 22,Topic 23,Topic 24
0,Endo_c3efp47_comment_0,-0.196283,-0.283443,-0.298325,-0.207986,-0.269278,-0.301317,-0.243336,-0.418346,-0.267528,...,-0.228994,0.303848,-0.259111,-0.285519,3.848055,-0.363482,-0.303682,-0.183964,-0.242979,2.277669


In [None]:
# Record which post/comment each paragraph comes from: the paragraph id is the id of the
# original post/comment followed by '_<n>', so keep only its first three '_'-separated parts.
tomo_df['og_id'] = ['_'.join(x.split('_')[:3]) for x in tomo_df.id]

# Average the paragraph distributions of each post/comment.
# The string 'id' column is dropped explicitly before averaging: leaving it in makes
# .mean() emit a deprecation warning on older pandas and raise on newer versions.
new_tomo_df = (
    tomo_df.drop(columns=['id'])
    .groupby(['og_id'], sort=False)
    .mean()
    .reset_index()
    .rename(columns={"og_id": "id"})
)
print(len(new_tomo_df))
new_tomo_df[:1]

334042


  new_tomo_df = tomo_df.groupby(['og_id'], sort=False).mean().reset_index().rename(columns = {"og_id":"id"})


Unnamed: 0,id,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,...,Topic 15,Topic 16,Topic 17,Topic 18,Topic 19,Topic 20,Topic 21,Topic 22,Topic 23,Topic 24
0,Endo_c3efp47_comment,-0.196283,-0.283443,-0.298325,-0.207986,-0.269278,-0.301317,-0.243336,-0.418346,-0.267528,...,-0.228994,0.303848,-0.259111,-0.285519,3.848055,-0.363482,-0.303682,-0.183964,-0.242979,2.277669


In [None]:
# Attach the averaged topic distributions to every post/comment by id.
# dropna() has no subset, so a row is removed when ANY of its columns is missing,
# which includes rows that found no topic distribution in the left join.
big_df = data_df.merge(new_tomo_df, how="left", on="id").dropna()
print('Length df after merge:', len(big_df))
big_df.head(1)

Length df after merge: 334068


Unnamed: 0,author,id,type,link_id,parent_id,reddit_id,Topic 0,Topic 1,Topic 2,Topic 3,...,Topic 15,Topic 16,Topic 17,Topic 18,Topic 19,Topic 20,Topic 21,Topic 22,Topic 23,Topic 24
0,endogirl,Endo_c3efp47_comment,comment,t3_mrkjh,t1_c39qhd9,t1_c3efp47,-0.196283,-0.283443,-0.298325,-0.207986,...,-0.228994,0.303848,-0.259111,-0.285519,3.848055,-0.363482,-0.303682,-0.183964,-0.242979,2.277669


In [None]:
# Keep only the first row for every id and renumber the index from 0
dupes = big_df['id'].duplicated()
dupes_idx = dupes[dupes].index
print('Number of dupes', len(dupes_idx))
big_df = big_df.drop(index=dupes_idx).reset_index(drop=True)
print('Length df after removing dupes', len(big_df))
big_df.head(1)

Number of dupes 26
Length df after removing dupes 334042


Unnamed: 0,author,id,type,link_id,parent_id,reddit_id,Topic 0,Topic 1,Topic 2,Topic 3,...,Topic 15,Topic 16,Topic 17,Topic 18,Topic 19,Topic 20,Topic 21,Topic 22,Topic 23,Topic 24
0,endogirl,Endo_c3efp47_comment,comment,t3_mrkjh,t1_c39qhd9,t1_c3efp47,-0.196283,-0.283443,-0.298325,-0.207986,...,-0.228994,0.303848,-0.259111,-0.285519,3.848055,-0.363482,-0.303682,-0.183964,-0.242979,2.277669


In [None]:
# Build the reply graph: posts are added as nodes, and every comment adds an edge that
# points from its parent (post or comment) to the comment itself.
# NOTE: add_edge creates missing endpoints, so a parent_id that is not itself a row of
# big_df still becomes a node (one without predecessors).
DG = nx.DiGraph()
# Iterate over the three needed columns directly; iterrows() builds a Series per row,
# which is needlessly slow for a frame of this size.
for row_type, parent_id, reddit_id in zip(big_df['type'], big_df['parent_id'], big_df['reddit_id']):
  if row_type == 'post':
    DG.add_node(reddit_id)
  elif row_type == 'comment':
    DG.add_edge(parent_id, reddit_id)

In [None]:
def check_for_predecessors(levels, level, graph, node):
  """Append to `levels` the number of ancestors above `node` in `graph`.

  Starting at `node`, repeatedly moves to the first predecessor until a node
  without predecessors is reached, adding 1 to `level` for every step.

  Args:
    levels: list that receives the result; exactly one value is appended.
    level: starting count (the notebook passes 0).
    graph: object exposing `predecessors(node)`, e.g. a networkx DiGraph.
    node: node to start from.

  Returns:
    None; the result is delivered through `levels`.

  Raises:
    ValueError: if the walk comes back to a node it already visited (cycle).
  """
  visited = {node}
  current = node

  while True:
    # Use the graph that was passed in; the previous version read the global DG
    # and silently ignored this parameter.
    parents = list(graph.predecessors(current))
    if not parents:
      break

    level += 1
    current = parents[0]
    if current in visited:
      # An iterative walk would spin forever on a cycle, so fail loudly instead.
      raise ValueError(f'cycle detected while walking up from {node!r}')
    visited.add(current)

  levels.append(level)

In [None]:
# Compute the level of every row (posts and comments) by walking up the reply graph.
# NOTE(review): the count stops at the first ancestor that has no predecessor in DG.
# A comment replying to a comment that is not itself a row of big_df is therefore counted
# from that missing comment (row 0 in the preview further down replies to t1_c39qhd9 yet
# gets level 1) - confirm this is the intended meaning of "level".
comm_levels = []
for node_id in big_df['reddit_id']:
  check_for_predecessors(comm_levels, 0, DG, node_id)
big_df['level'] = comm_levels

In [None]:
# Distinct levels present in the data, in order of first appearance
big_df['level'].unique()

array([ 1,  2,  0,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29])

In [None]:
# Preview the id columns next to the computed level for the first ten rows
big_df[['id', 'link_id', 'parent_id', 'reddit_id', 'level']].head(10)

Unnamed: 0,id,link_id,parent_id,reddit_id,level
0,Endo_c3efp47_comment,t3_mrkjh,t1_c39qhd9,t1_c3efp47,1
1,Endo_c3empyk_comment,t3_o5y3y,t3_o5y3y,t1_c3empyk,1
2,Endo_c3epqgz_comment,t3_o5y3y,t3_o5y3y,t1_c3epqgz,1
3,Endo_c3epskq_comment,t3_mrkjh,t1_c3efp47,t1_c3epskq,2
4,Endo_c3hivzc_comment,t3_o5y3y,t3_o5y3y,t1_c3hivzc,1
5,Endo_c3insbt_comment,t3_o5y3y,t3_o5y3y,t1_c3insbt,1
6,Endo_otb0m_post,,,t3_otb0m,0
7,Endo_c3k28n9_comment,t3_mrkjh,t1_c3epskq,t1_c3k28n9,3
8,Endo_c3k29fy_comment,t3_otb0m,t3_otb0m,t1_c3k29fy,1
9,Endo_c3k3mx4_comment,t3_otb0m,t1_c3k29fy,t1_c3k3mx4,2


In [None]:
# Posts only, without the 'level' column
is_post = big_df['type'] == 'post'
posts_df = big_df[is_post].drop(columns='level').copy()
print(len(posts_df))
posts_df.head(1)

34190


Unnamed: 0,author,id,type,link_id,parent_id,reddit_id,Topic 0,Topic 1,Topic 2,Topic 3,...,Topic 15,Topic 16,Topic 17,Topic 18,Topic 19,Topic 20,Topic 21,Topic 22,Topic 23,Topic 24
6,theonusta,Endo_otb0m_post,post,,,t3_otb0m,-0.205552,-0.298024,0.329341,-0.218773,...,-0.239455,2.531225,-0.271936,-0.298816,-0.224973,-0.176872,-0.318968,-0.192726,-0.254702,-0.313315


In [None]:
# Comments only; 'level' is kept because the analyses below slice by it
is_comment = big_df['type'] == 'comment'
comments_df = big_df[is_comment].copy()
print(len(comments_df))
comments_df.head(1)

299852


Unnamed: 0,author,id,type,link_id,parent_id,reddit_id,Topic 0,Topic 1,Topic 2,Topic 3,...,Topic 16,Topic 17,Topic 18,Topic 19,Topic 20,Topic 21,Topic 22,Topic 23,Topic 24,level
0,endogirl,Endo_c3efp47_comment,comment,t3_mrkjh,t1_c39qhd9,t1_c3efp47,-0.196283,-0.283443,-0.298325,-0.207986,...,0.303848,-0.259111,-0.285519,3.848055,-0.363482,-0.303682,-0.183964,-0.242979,2.277669,1


## Correlation between comments of different levels and the post they reply to

In [None]:
def bootstrap(dataslice, _topic, _topic_comm, trials=1000, random_state=None):
  """Percentile-bootstrap 95% interval for the Pearson correlation of two columns.

  The rows of `dataslice` are resampled with replacement `trials` times (each
  resample has as many rows as `dataslice`), the Pearson r between the two
  columns is computed for every resample, and the values found at the 2.5% and
  97.5% positions of the sorted coefficients are returned.

  Args:
    dataslice: DataFrame containing both columns.
    _topic: name of the first column.
    _topic_comm: name of the second column.
    trials: number of bootstrap resamples (default 1000, the previous fixed value).
    random_state: None (default) draws from NumPy's global RNG exactly as before;
      an int seed or a numpy RandomState/Generator makes the result reproducible.

  Returns:
    (low, high): lower and upper bound of the interval.

  Raises:
    ValueError: if `trials` is smaller than 1.
  """
  if trials < 1:
    raise ValueError('trials must be at least 1')

  # One generator shared by all trials: a fixed seed then gives reproducible but
  # distinct resamples. Passing the same int to every .sample() call would instead
  # yield the identical resample each time.
  if random_state is None or isinstance(random_state, (np.random.RandomState, np.random.Generator)):
    rng = random_state
  else:
    rng = np.random.RandomState(random_state)

  k = len(dataslice)
  bootstrapped_corr = []
  for _ in range(trials):
    sample = dataslice.sample(n=k, replace=True, random_state=rng)
    # Keep only the coefficient; the p-value returned by pearsonr is never used.
    bootstrapped_corr.append(stats.pearsonr(sample[_topic], sample[_topic_comm])[0])

  bootstrapped_corr.sort()
  low = bootstrapped_corr[int(trials*0.025)]
  high = bootstrapped_corr[int(trials*0.975)]

  return low, high

In [None]:
# Names of the topic columns (every column whose name contains 'Topic')
topic_list = list(filter(lambda name: 'Topic' in name, posts_df.columns))

In [None]:
# Map every topic column to the name used for the comment-side copy: 'Topic N' -> 'Topic N comments'
rename_d = {topic_name: f'{topic_name} comments' for topic_name in topic_list}

In [None]:
extended_corr = {}

# Seed NumPy's global RNG. bootstrap() resamples with DataFrame.sample, which draws from
# the global RNG when no random_state is given, so this makes the intervals reproducible.
np.random.seed(0)

# Use the ten lowest levels. Sorting makes the selection independent of the order in which
# the levels happen to appear in comments_df.
for level in sorted(comments_df['level'].unique())[:10]:
  level_col = f'Level {level}'
  print(level_col)

  extended_corr[level_col] = []
  extended_corr[level_col+' low'] = []
  extended_corr[level_col+' high'] = []

  # Average, per thread (link_id), the topic scores of all comments at this level.
  # numeric_only=True leaves out the string columns (author, id, ...); averaging them
  # only produced a deprecation warning on older pandas and raises on newer versions.
  comm_level = (
      comments_df.loc[comments_df['level'] == level]
      .groupby(['link_id'], sort=False)
      .mean(numeric_only=True)
      .rename(columns=rename_d)
  )
  print(len(comm_level))

  # Match every post with the averaged comments of its thread (post reddit_id == comment
  # link_id). Posts with no comments at this level get NaNs from the left join and are dropped.
  post_comm = pd.merge(posts_df, comm_level, how = "left", left_on = "reddit_id", right_on = "link_id").dropna()
  print(len(post_comm))

  # For each topic: observed post/comment correlation plus its bootstrap interval
  for topic in topic_list:
    topic_comm = f'{topic} comments'
    obv_corr = stats.pearsonr(post_comm[topic], post_comm[topic_comm])[0]
    _low, _high = bootstrap(post_comm[[topic, topic_comm]], topic, topic_comm)

    extended_corr[level_col].append(obv_corr)
    extended_corr[level_col+' low'].append(_low)
    extended_corr[level_col+' high'].append(_high)

In [None]:
# Tabulate the results: one row per topic and, for every level, three columns
# (observed correlation, lower bootstrap bound, upper bootstrap bound)
extended_corr_df = pd.DataFrame(extended_corr, index=topic_list)
extended_corr_df

Unnamed: 0,Level 1,Level 1 low,Level 1 high,Level 2,Level 2 low,Level 2 high,Level 3,Level 3 low,Level 3 high,Level 4,...,Level 7 high,Level 8,Level 8 low,Level 8 high,Level 9,Level 9 low,Level 9 high,Level 10,Level 10 low,Level 10 high
Topic 0,0.379078,0.351318,0.405341,0.256943,0.221194,0.292579,0.22446,0.181316,0.266556,0.168558,...,0.138273,0.050985,-0.036892,0.169538,0.155384,-0.054661,0.50751,-0.026786,-0.056346,0.038757
Topic 1,0.356473,0.337665,0.376831,0.167139,0.144999,0.188434,0.159421,0.132579,0.189948,0.113677,...,0.097105,0.056229,-0.034181,0.170089,0.010909,-0.077619,0.129056,-0.00652,-0.11326,0.147466
Topic 2,0.372078,0.354891,0.389308,0.276142,0.253401,0.302461,0.17756,0.152353,0.203281,0.135831,...,0.18527,0.200456,0.042491,0.348981,0.091088,-0.031883,0.226317,0.026373,-0.095942,0.191282
Topic 3,0.282761,0.257281,0.309495,0.185025,0.152975,0.215764,0.14956,0.116765,0.184495,0.162236,...,0.186858,0.110707,-0.03765,0.347972,0.059096,-0.048939,0.286068,-0.048837,-0.083099,-0.030043
Topic 4,0.345654,0.326352,0.364934,0.22229,0.19921,0.247117,0.182848,0.156332,0.209592,0.150017,...,0.13039,0.094305,-0.012036,0.254255,0.116188,-0.03355,0.303879,0.056923,-0.065072,0.328754
Topic 5,0.376224,0.358102,0.393269,0.210123,0.188921,0.23328,0.145769,0.121558,0.171748,0.137879,...,0.184377,0.110047,0.004374,0.22496,0.075447,-0.033006,0.234117,0.228017,-0.042161,0.501496
Topic 6,0.295302,0.267954,0.328165,0.208619,0.179564,0.237811,0.117715,0.087798,0.146892,0.170054,...,0.314087,0.233324,0.085493,0.387651,0.124789,-0.008392,0.306385,0.06145,-0.06714,0.274354
Topic 7,0.268353,0.250378,0.285499,0.181416,0.164703,0.199654,0.136454,0.114798,0.15884,0.14452,...,0.155468,0.138149,0.03419,0.241398,0.09967,-0.017985,0.243728,0.257071,0.072629,0.451619
Topic 8,0.337991,0.314575,0.360771,0.249096,0.221096,0.279251,0.188387,0.156838,0.218868,0.164419,...,0.367055,0.204594,0.060382,0.353894,0.132223,-0.031089,0.353048,0.106238,-0.071416,0.360637
Topic 9,0.172285,0.155829,0.188266,0.133011,0.112886,0.153886,0.091221,0.071305,0.113411,0.049588,...,0.163562,0.042328,-0.025931,0.128843,0.040142,-0.085841,0.18001,0.254723,0.005994,0.501397


In [None]:
# Write the post-vs-level correlation table to the project's output folder
output_dir = os.path.join(endo_dir, 'output')
posts_levels_csv = os.path.join(output_dir, 'corr_posts_levels.csv')
extended_corr_df.to_csv(posts_levels_csv)

## Correlation between comments and the text they reply to

In [None]:
extended_corr = {}

# Topic columns of the replied-to text and the matching columns of the averaged replies.
# They are selected by name rather than by position, so pairing a topic with its
# '<topic> comments' counterpart does not rely on there being exactly 25 topics or on
# the column order of the merged frame.
topic_cols = [col for col in comments_df.columns if 'Topic' in col]
rename_d = {col: f'{col} comments' for col in topic_cols}
reply_cols = [rename_d[col] for col in topic_cols]

# slice by level (sorted so the output columns come in level order)
for level in sorted(comments_df['level'].unique()):
  print(f'LEVEL: {level}')

  # Average the topic scores of all comments at this level that share a parent.
  # numeric_only=True leaves out the string columns, which newer pandas refuses to average.
  comm_level = (
      comments_df.loc[comments_df['level'] == level]
      .groupby(['parent_id'], sort=False)
      .mean(numeric_only=True)
      .rename(columns=rename_d)
  )
  print(len(comm_level))

  # Level-1 comments are matched against posts, deeper levels against comments.
  # 'level' and 'parent_id' are removed from the comment side so they do not clash with
  # the columns/key coming from comm_level.
  if level == 1:
    parents = posts_df
  else:
    parents = comments_df.drop(columns=['level', 'parent_id'])

  # Parents without replies at this level get NaNs from the left join and are dropped.
  replies = pd.merge(parents, comm_level, how = "left", left_on = "reddit_id", right_on = "parent_id").dropna()
  print(len(replies))

  # Correlate only the numeric topic columns; rows are parent topics, columns are reply topics.
  topics_corr = replies[topic_cols + reply_cols].corr()
  topic_corr = topics_corr.loc[topic_cols, reply_cols]

  # The diagonal pairs each topic with the same topic in the replies.
  column = f'Level {level}'
  extended_corr[column] = np.diag(topic_corr).tolist()

In [None]:
# Tabulate the results: one row per topic, one column per level.
# topic_corr is the matrix left over from the last iteration of the loop above;
# only its row labels (the topic names) are used here.
extended_corr_df = pd.DataFrame(extended_corr, index=topic_corr.index)
extended_corr_df

In [None]:
# Write the reply-vs-parent correlation table next to the other outputs (output_dir is set in an earlier cell)
comms_levels_csv = os.path.join(output_dir, 'corr_comms_levels.csv')
extended_corr_df.to_csv(comms_levels_csv)