In [None]:
!pip install Levenshtein

Collecting Levenshtein
  Downloading Levenshtein-0.25.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (177 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.4/177.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rapidfuzz<4.0.0,>=3.8.0 (from Levenshtein)
  Downloading rapidfuzz-3.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, Levenshtein
Successfully installed Levenshtein-0.25.1 rapidfuzz-3.9.3


In [None]:
import os
import re
import pprint
from collections import Counter
import pandas as pd
import numpy as np
import Levenshtein
import networkx as nx
from google.colab import drive, files

In [None]:
# Mount Google Drive at /content/drive (only needed when running on Google Colab);
# the project directory used below lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Root of the project on the mounted Drive; the dataset is read from its
# `data/` subfolder. Point this to your own project directory.
endo_dir = '/content/drive/MyDrive/endometriosis/'

In [None]:
# replace with the name of your dataset file
dataset_file = os.path.join(endo_dir, 'data', 'endo+endometriosis.pkl')
data_df = pd.read_pickle(dataset_file)

# year = whatever precedes the first '-' in the string form of `time`
data_df['year'] = data_df['time'].map(lambda stamp: int(str(stamp).split('-')[0]))

print(len(data_df))
data_df.head(1)

392087


Unnamed: 0,author,id,type,text,url,link_id,parent_id,flair,subreddit,created_utc,time,year
0,endogirl,Endo_c3efp47_comment,comment,Soy is a no go as it increases estrogen in you...,http://www.reddit.com/r/Endo/comments/mrkjh/,t3_mrkjh,t1_c39qhd9,,Endo,1325832269,2012-01-06 06:44:29,2012


In [None]:
# Collect the index labels of rows whose text is exactly one of Reddit's
# placeholders for deleted/removed content. A vectorised membership test
# replaces the row-by-row iterrows() scan and yields the same labels in the
# same order.
placeholder_mask = data_df['text'].isin(['[deleted]', '[removed]'])
indexes = data_df.index[placeholder_mask].tolist()
len(indexes)

4210

In [None]:
# Drop the placeholder rows collected in `indexes`, then sort by created_utc.
# errors='ignore' makes the cell safe to re-run: on a second execution the
# labels are already gone and a plain drop() would raise KeyError.
data_df = data_df.drop(indexes, errors='ignore').sort_values('created_utc')
len(data_df)

387877

In [None]:
# Keep only rows typed 'post'; .copy() makes `posts` an independent frame
is_post = data_df['type'] == 'post'
posts = data_df[is_post].copy()
len(posts)

34715

In [None]:
# Keep only rows typed 'comment'. Copy explicitly, as is done for `posts`, so
# later modifications of `comments` do not raise SettingWithCopyWarning.
comments = data_df.loc[data_df['type']=='comment'].copy()
len(comments)

353162

## How many duplicated posts?

In [None]:
def check_if_same_subreddit(s1, s2):
  """Return 1 when both ids carry the 'endometriosis' prefix, otherwise 0.

  The prefix of an id is the text before its first underscore.

  NOTE(review): despite the function name, two ids that share any other
  prefix (e.g. both 'Endo') yield 0 -- confirm this is the intended behaviour.
  """
  prefixes = (s1.split('_')[0], s2.split('_')[0])
  return int(prefixes == ('endometriosis', 'endometriosis'))

In [None]:
def check_subreddit(s1, s2):
  """Return 1 when the two ids come from different subreddits, otherwise 0.

  The subreddit is the text before the first underscore of each id. The
  result is 1 only when one prefix is 'Endo' and the other is
  'endometriosis' (in either order).
  """
  prefixes = {s1.split('_')[0], s2.split('_')[0]}
  return int(prefixes == {'Endo', 'endometriosis'})

In [None]:
def find_duplicates(df, _ratio):
  """Flag posts whose text is a near-copy of an earlier post.

  Rows of ``df`` are visited in order with ``iterrows()``; each row must
  provide 'author', 'text' and 'id'. Two texts count as duplicates when
  ``Levenshtein.ratio`` between them is strictly greater than ``_ratio``.

  * Author known (anything other than '[deleted]'): the post is compared with
    every post already retained for that author and one record is emitted per
    match. An author's first post, and any later post that matches nothing,
    is added to that author's retained list; a post that matches is not.
  * Author '[deleted]': the post is compared only with the row immediately
    before it in ``df`` (whoever wrote it), so the result depends on row order.

  Returns a list of dicts with keys 'id1' (current post), 'id2' (earlier
  post), 'lev_ratio' (the threshold ``_ratio`` itself, not the measured
  similarity) and 'xpost' (``check_subreddit`` applied to the two ids).
  """
  retained_by_author = {}  # author -> list of (reddit_id, text) kept for comparison
  found = []               # one record per flagged pair
  previous = ('', '')      # (reddit_id, text) of the row visited just before

  def _record(current_id, earlier_id):
    # xpost marks pairs that span the two combined subreddits
    found.append({'id1': current_id,
                  'id2': earlier_id,
                  'lev_ratio': _ratio,
                  'xpost': check_subreddit(current_id, earlier_id),
                  })

  for _, row in df.iterrows():
    author = row['author']
    text = row['text']
    reddit_id = row['id']

    if author == '[deleted]':
      # no author to group by: compare with the preceding row only
      if Levenshtein.ratio(text, previous[1]) > _ratio:
        _record(reddit_id, previous[0])
    elif author not in retained_by_author:
      retained_by_author[author] = [(reddit_id, text)]
    else:
      matched = False
      for earlier_id, earlier_text in retained_by_author[author]:
        if Levenshtein.ratio(text, earlier_text) > _ratio:
          _record(reddit_id, earlier_id)
          matched = True
      if not matched:
        retained_by_author[author].append((reddit_id, text))

    previous = (reddit_id, text)

  return found

In [None]:
%%time
duplicates = {}
for ratio in np.arange(0.3, 1, 0.025): # try different Levenshtein thresholds
  dupes = find_duplicates(posts, ratio)  # find duplicates
  duplicates[ratio] = dupes
  print(f'Number of duplicates: {len(dupes), len(dupes)/len(posts)}')

Number of duplicates: (20306, 0.5849344663690047)
Number of duplicates: (19831, 0.5712516203370301)
Number of duplicates: (19030, 0.5481780210283739)
Number of duplicates: (17751, 0.5113351577128042)
Number of duplicates: (15383, 0.44312256949445483)
Number of duplicates: (8471, 0.24401555523548898)
Number of duplicates: (2296, 0.06613855681981852)
Number of duplicates: (1481, 0.04266167362811465)
Number of duplicates: (1315, 0.03787987901483508)
Number of duplicates: (1272, 0.0366412213740458)
Number of duplicates: (1246, 0.03589226559124298)
Number of duplicates: (1232, 0.035488981708195304)
Number of duplicates: (1224, 0.035258533775025205)
Number of duplicates: (1217, 0.035056891833501365)
Number of duplicates: (1208, 0.03479763790868501)
Number of duplicates: (1215, 0.03499927985020884)
Number of duplicates: (1195, 0.03442316001728359)
Number of duplicates: (1181, 0.03401987613423592)
Number of duplicates: (1178, 0.03393345815929713)
Number of duplicates: (1169, 0.0336742042344807

In [None]:
# Flatten the per-threshold results into a single list of records
long_list_duplicates = [record
                        for found in duplicates.values()
                        for record in found]
len(long_list_duplicates)

127913

In [None]:
# One row per flagged pair, pooled over every Levenshtein threshold tried
dupes_df = pd.DataFrame(data=long_list_duplicates)
dupes_df.head(1)

Unnamed: 0,id1,id2,lev_ratio,xpost
0,Endo_p0e2x_post,Endo_otb0m_post,0.3,0


In [None]:
# Check that every Levenshtein threshold that was tried appears in the
# 'lev_ratio' column of the pooled frame.
dupes_df['lev_ratio'].unique()

array([0.3  , 0.325, 0.35 , 0.375, 0.4  , 0.425, 0.45 , 0.475, 0.5  ,
       0.525, 0.55 , 0.575, 0.6  , 0.625, 0.65 , 0.675, 0.7  , 0.725,
       0.75 , 0.775, 0.8  , 0.825, 0.85 , 0.875, 0.9  , 0.925, 0.95 ,
       0.975])

In [None]:
# Per threshold, sum the xpost flag (number of flagged pairs spanning both subreddits)
new_dupes_df = dupes_df.groupby('lev_ratio')[['xpost']].sum()

In [None]:
# Cross-posted pairs per threshold as a whole-number percentage of all posts.
# The denominator is taken from `posts` instead of a hard-coded row count, so
# the figure stays correct if the dataset changes.
n_posts = len(posts)
new_dupes_df['xpost_perc'] = new_dupes_df['xpost'].apply(lambda x: round(x/n_posts*100))

In [None]:
# Number of flagged pairs per threshold (non-null id1 entries in each group),
# assigned positionally in the group order produced by groupby
pairs_per_threshold = dupes_df.groupby('lev_ratio')['id1'].count()
new_dupes_df['total'] = pairs_per_threshold.to_numpy()

In [None]:
# Flagged pairs per threshold as a whole-number percentage of all posts.
# The denominator is taken from `posts` instead of a hard-coded row count, so
# the figure stays correct if the dataset changes.
n_posts = len(posts)
new_dupes_df['total_perc'] = new_dupes_df['total'].apply(lambda x: round(x/n_posts*100))

In [None]:
new_dupes_df

Unnamed: 0_level_0,xpost,xpost_perc,total,total_perc
lev_ratio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.3,5287,15,20306,58
0.325,5117,15,19831,57
0.35,4875,14,19030,55
0.375,4571,13,17751,51
0.4,4020,12,15383,44
0.425,2498,7,8471,24
0.45,1267,4,2296,7
0.475,1112,3,1481,4
0.5,1078,3,1315,4
0.525,1069,3,1272,4


In [None]:
# Sanity check: first flagged pair recorded at a threshold above 0.451
# (presumably 0.451 rather than 0.45 to avoid float-equality issues with the grid)
above_045 = dupes_df['lev_ratio'] > 0.451
dupes_df.loc[above_045].head(1)

Unnamed: 0,id1,id2,lev_ratio,xpost
103068,Endo_xmdo4_post,Endo_vix9q_post,0.475,0


In [None]:
posts[posts['id'] == 'Endo_xmdo4_post']['text'].values

array(['I get this pain right underneath my right rib and it hurts! I know it always starts to hurt when my PMS starts. If I eat a piece of bread it gets worse. I was just wondering... if anyone else had this to add on their list of never ending symptoms.'],
      dtype=object)

In [None]:
posts[posts['id'] == 'Endo_vix9q_post']['text'].values

array(["I was always one that lived on 4 hours of sleep, always wanted to go out, now I'm just tired all the time. I was wondering if this has happened to anyone else?"],
      dtype=object)

In [None]:
# Sanity check: first flagged pair recorded at a threshold above 0.476
# (presumably 0.476 rather than 0.475 to avoid float-equality issues with the grid)
above_0475 = dupes_df['lev_ratio'] > 0.476
dupes_df.loc[above_0475].head(1)

Unnamed: 0,id1,id2,lev_ratio,xpost
104549,Endo_11uspq_post,Endo_11ukbn_post,0.5,0


In [None]:
posts[posts['id'] == 'Endo_11uspq_post']['text'].values

array(["I turned 30 two weeks ago. I will be having a full hysterectomy on November 12, 2012. I have been suffering from endometriosis for almost four years. I've had a D&amp;C and 3 laparoscopic surgeries. I take pain killers almost daily just to function at work. \n\nBasically I'm scared. I have no idea what to expect after a hysterectomy. I've read many articles online and haven't found any comforting information. \n\nAny advice or suggestions are welcomed and appreciated."],
      dtype=object)

In [None]:
posts[posts['id'] == 'Endo_11ukbn_post']['text'].values

array(["I am 30 and going in for a full hysterectomy on November 2, 2012. I have been suffering from endometriosis and severe pain for almost four years. My adhesions are extensive and involve my intestines and ovaries. I have no fallopian tubes and three new cysts, all my past cysts have been endomitriomas. I have tried countless of treatments and  it continues to come back. I've had 3 laparoscopic surgeries in the last 3 years and a D&amp;C. \n\nI just turned 30 two weeks ago and I have no children. I'm just really scared on how a hysterectomy will affect my body. Any advice is welcomed and appreciated."],
      dtype=object)

## How many mentions of cross-posting?

We use simple keyword matching here because this is mostly a sanity check that there is not too much cross-posting between r/Endo and r/endometriosis.

In [None]:
# Mentions of cross-posting keywords or of either subreddit name among duplicates.
# NOTE(review): `dupes` is used here as a list of index labels into `posts`, but
# the threshold-sweep cell leaves `dupes` bound to a list of dicts, which
# `posts.loc[...]` cannot index. The recorded output must come from a definition
# that is no longer in the notebook -- define `dupes` explicitly (index labels of
# the duplicated posts at the chosen threshold) before running this cell.
xpost_dupes = []
not_xpost_dupes = []
# keyword list built once instead of on every iteration
l = ['cross post','crosspost','cross-post','xpost', 'x-post', 'r/endo', 'r/endometriosis']
for index, row in posts.loc[dupes].iterrows():
  t = row['text'].lower()  # case-insensitive matching
  if any(keyword in t for keyword in l):
    xpost_dupes.append(index)

In [None]:
len(xpost_dupes)

83

In [None]:
# Index labels of all posts whose lower-cased text mentions a subreddit name
update = []
l = ['r/endo', 'r/endometriosis']

rpost = [index
         for index, text in posts['text'].items()
         if any(name in text.lower() for name in l)]

In [None]:
len(rpost)

359

In [None]:
len(np.intersect1d(dupes,rpost))

74

In [None]:
# mention of crossposting keywords in all posts
%%time
xpost = []
l = ['cross post','crosspost','cross-post','xpost', 'x-post']

for index, row in posts.iterrows():

  t = row['text'].lower()

  if l[0] in t or l[1] in t or l[2] in t or l[3] in t or l[4] in t:
     xpost.append(index)

CPU times: user 3.65 s, sys: 17.3 ms, total: 3.67 s
Wall time: 3.78 s


In [None]:
len(xpost)

85

In [None]:
len(np.intersect1d(dupes,xpost))

35

In [None]:
# Index labels of posts mentioning BOTH a cross-posting keyword and a subreddit name
l = ['cross post','crosspost','cross-post','xpost', 'x-post', 'r/endo', 'r/endometriosis']
l2 = ['r/endo', 'r/endometriosis']


def _mentions_both(text):
  # only the first five entries of `l` are cross-posting keywords
  lowered = text.lower()
  has_keyword = any(keyword in lowered for keyword in l[:5])
  return has_keyword and any(name in lowered for name in l2)


xrpost = [index for index, text in posts['text'].items() if _mentions_both(text)]

In [None]:
len(xrpost)

47

In [None]:
len(np.intersect1d(dupes,xrpost))

26