In [None]:
import logging
from collections import defaultdict

import networkx as nx
import praw
import json
import pandas as pd
import tqdm.notebook as tqdm
from tinydb import TinyDB, Query
import hashlib

# Thanks to https://github.com/almayor/reddit-mod-overlap for the great work
# and permission to use this code

In [None]:
# Emit INFO-level progress messages from the crawler.
logging.basicConfig(level=logging.INFO)

# TinyDB query builders: Mod is used against the moderator/subreddit pairs,
# Bot against the bot-name store and Sub against the subreddit-size store.
Mod, Bot, Sub = Query(), Query(), Query()

In [None]:
# Input/output locations. The original note says these "determine left or
# right"; both currently point at the "right" dataset, so change them together.
OUTPUTFILEPATH = "../test_subredditdata/mods/right/"
INPUTJSONNAME = "../input/rightsubreddits.json"

# Moderator account names that are always classified as bots.
KNOWN_BOTS = {
    'AutoModerator',
    'comment-nuke',
    'hive-protect',
    'mod-mentions',
    'modmail-userinfo',
    'onedayflair',
    'purge-user',
    'spam-nuke',
}

# Subreddits that need an explicit quarantine opt-in before they can be read.
KNOWN_QUARANTINED = {"TheRedPill"}

# Seed subreddits: a copy of the "subreddits" entry of the input JSON file.
SUBSLIST = []
with open(INPUTJSONNAME, 'r') as subsjson:
    data = json.load(subsjson)
SUBSLIST = data["subreddits"][:]
print(SUBSLIST)


In [None]:
class Wanderer:
    """Crawl subreddit/moderator relations on Reddit and persist them in TinyDB.

    Three TinyDB files are opened under ``OUTPUTFILEPATH``:

    * ``mods.json`` - one ``{'mod': ..., 'sub': ...}`` record per moderation pair
    * ``bots.json`` - one ``{'name': ...}`` record per account classified as a bot
    * ``subs.json`` - one ``{'name': ..., 'nsubscr': ...}`` record per sized subreddit

    ``subs_searched`` and ``mods_searched`` remember, for the lifetime of the
    instance, which subreddit and moderator names have already been processed.
    """

    # Optional overrides: __init__ only creates a new client/database when the
    # corresponding attribute is still falsy at instantiation time.
    reddit = None
    db_mods = None
    db_bots = None
    db_subs = None

    # Subreddits with fewer subscribers than this are ignored by wander().
    MIN_SUBSCRIBERS = 1000
    # Accounts moderating more subreddits than this are treated as bots.
    BOT_MODERATED_THRESHOLD = 500

    def __init__(self):
        """Create the Reddit client, open the three databases and reset crawl state."""
        self.reddit = self.reddit or praw.Reddit(
            client_id="REDACTED",
            client_secret="REDACTED",
            user_agent="REDACTED",
            username="REDACTED",
            password="REDACTED")

        self.db_mods = self.db_mods or TinyDB(OUTPUTFILEPATH + 'mods.json')
        self.db_bots = self.db_bots or TinyDB(OUTPUTFILEPATH + 'bots.json')
        self.db_subs = self.db_subs or TinyDB(OUTPUTFILEPATH + 'subs.json')

        self.subs_searched = set()
        self.mods_searched = set()

    def get_sub_size(self, sub_name):
        """Return the subscriber count of ``sub_name``, fetching and caching it if needed.

        The count is read from ``subs.json`` when a record for the name exists;
        otherwise it is requested from Reddit and stored before being returned.
        """
        cached = self.db_subs.search(Sub.name == sub_name)
        if cached:
            return cached[0]['nsubscr']
        nsubscr = self.reddit.subreddit(sub_name).subscribers
        self.db_subs.insert({'name': sub_name, 'nsubscr': nsubscr})
        return nsubscr

    def _enter_subreddit(self, sub_name):
        """Return the Subreddit for ``sub_name``, opting in first if it is quarantined.

        The check is done on the *name*: a Subreddit object is never a member
        of a set of plain strings, so testing the object would silently skip
        the opt-in. Names are compared case-insensitively.
        """
        subreddit = self.reddit.subreddit(sub_name)
        quarantined_names = {name.lower() for name in KNOWN_QUARANTINED}
        if sub_name.lower() in quarantined_names:
            subreddit.quaran.opt_in()
        return subreddit

    def _initial_subs(self, start_sub_name, subs_queue):
        """Build the starting subreddit queue from a name, a list of names or a queue."""
        if isinstance(start_sub_name, list):
            return [self._enter_subreddit(name) for name in start_sub_name]
        if isinstance(start_sub_name, str):
            return [self._enter_subreddit(start_sub_name)]
        # Copy so the caller's list is never shared with the crawler state.
        return list(subs_queue) if subs_queue else []

    def _record_moderation(self, mod_name, sub_name):
        """Store the (moderator, subreddit) pair unless it is already recorded."""
        if not self.db_mods.search((Mod.mod == mod_name) & (Mod.sub == sub_name)):
            self.db_mods.insert({'mod': mod_name, 'sub': sub_name})

    def wander(self, depth, start_sub_name=None, mods_queue=None, subs_queue=None):
        """Alternate between subreddits and their moderators for ``depth + 1`` passes.

        Each pass first collects the non-bot moderators of every queued
        subreddit, then records every sufficiently large subreddit those
        moderators moderate and queues the ones not yet searched.

        Args:
            depth: number of additional passes after the first; the loop runs
                while ``depth >= 0``.
            start_sub_name: a subreddit name or list of names to start from.
                When given, it takes precedence over ``subs_queue``.
            mods_queue: optional initial list of moderator objects.
            subs_queue: optional initial list of subreddit objects.
        """
        # Copy so appending below never mutates the caller's list.
        self.mods_queue = list(mods_queue) if mods_queue else []
        self.subs_queue = self._initial_subs(start_sub_name, subs_queue)

        while depth >= 0:
            # Names already waiting in the queue; prevents the same moderator
            # from being fetched once per subreddit they moderate.
            queued_mod_names = {mod.name for mod in self.mods_queue}
            for sub in tqdm.tqdm(self.subs_queue):
                sub_name = sub.display_name
                logging.info(f'Processing sub {sub_name}')
                for mod in sub.moderator():
                    mod_name = mod.name
                    if self.is_bot(mod):
                        logging.info(f'BOT: {mod_name}')
                    elif (mod_name not in self.mods_searched
                          and mod_name not in queued_mod_names):
                        self.mods_queue.append(mod)
                        queued_mod_names.add(mod_name)
                        logging.info(f'Added mod {mod_name} to queue')
                self.subs_searched.add(sub_name)
            logging.info(f'{len(self.mods_queue)} mods in queue')
            self.subs_queue = []

            # Same de-duplication for subreddits shared by several moderators.
            queued_sub_names = set()
            for mod in tqdm.tqdm(self.mods_queue):
                mod_name = mod.name
                logging.info(f'Processing mod {mod_name}')
                for sub in mod.moderated():
                    sub_name = sub.display_name
                    if sub.subscribers < self.MIN_SUBSCRIBERS:
                        logging.info(f'TINY SUB: {sub_name}')
                        continue
                    self._record_moderation(mod_name, sub_name)
                    if (sub_name not in self.subs_searched
                            and sub_name not in queued_sub_names):
                        self.subs_queue.append(sub)
                        queued_sub_names.add(sub_name)
                        logging.info(f'Added sub {sub_name} to queue')
                self.mods_searched.add(mod_name)
            logging.info(f'{len(self.subs_queue)} subs in queue')
            self.mods_queue = []

            depth -= 1

    def nodepth_wander(self, start_sub_name=None, mods_queue=None, subs_queue=None):
        """Record the non-bot moderators of the given subreddits without expanding further.

        Unlike ``wander``, moderators are not followed to their other
        subreddits and no subscriber threshold is applied.

        Args:
            start_sub_name: a subreddit name or list of names. When given, it
                takes precedence over ``subs_queue``.
            mods_queue: stored on the instance but not otherwise used here.
            subs_queue: optional list of subreddit objects to process.
        """
        self.mods_queue = list(mods_queue) if mods_queue else []
        self.subs_queue = self._initial_subs(start_sub_name, subs_queue)

        for sub in tqdm.tqdm(self.subs_queue):
            sub_name = sub.display_name
            logging.info(f'Processing sub {sub_name}')
            for mod in sub.moderator():
                mod_name = mod.name
                if self.is_bot(mod):
                    logging.info(f'BOT: {mod_name}')
                else:
                    self._record_moderation(mod_name, sub_name)
            self.subs_searched.add(sub_name)

    def is_bot(self, mod):
        """Return True if ``mod`` is classified as a bot, caching positive results.

        An account counts as a bot when it is already stored in ``bots.json``,
        when its lower-cased name starts or ends with ``bot``, when it is listed
        in ``KNOWN_BOTS``, or when it moderates more than
        ``BOT_MODERATED_THRESHOLD`` subreddits. Newly detected bots are stored
        under the ``'name'`` key, which is the key the lookup queries.
        """
        mod_name = mod.name
        # `Bot.bot` also matches records written under the old, mistaken 'bot' key.
        if self.db_bots.search((Bot.name == mod_name) | (Bot.bot == mod_name)):
            return True

        lowered = mod_name.lower()
        # Checks are ordered cheapest first; `or` short-circuits so the
        # moderated() request is only made when the name checks all fail.
        looks_like_bot = (
            lowered.startswith('bot')
            or lowered.endswith('bot')
            or mod_name in KNOWN_BOTS
            or len(mod.moderated()) > self.BOT_MODERATED_THRESHOLD
        )
        if looks_like_bot:
            self.db_bots.insert({'name': mod_name})
        return looks_like_bot
            

In [None]:
# Crawl outward from one seed subreddit. depth=1 runs the
# subs -> mods -> subs cycle twice (the loop continues while depth >= 0).
wanderer = Wanderer()
# wanderer.wander(0, SUBSLIST)
wanderer.wander(depth=1, start_sub_name="conservatives")

In [None]:
# Load every stored (mod, sub) record, then report the number of distinct
# subreddits followed by the number of non-null rows.
df_mods = pd.DataFrame(wanderer.db_mods.all())
print(df_mods['sub'].nunique(), df_mods['sub'].count(), sep='\n')

In [None]:
# Ensure every crawled subreddit has a subscriber count in the subs database;
# get_sub_size only contacts Reddit for names it has not stored yet.
subs_to_size = df_mods['sub'].unique()
for sub in tqdm.tqdm(subs_to_size):
    wanderer.get_sub_size(sub)

In [None]:
df_subs = pd.DataFrame(wanderer.db_subs.all())
print(df_subs)
# Attach subscriber counts to each (mod, sub) row. Only the two pair columns
# are taken from df_mods so that re-running this cell yields the same frame
# instead of accumulating `name_x`/`nsubscr_x`-style suffix columns.
# NOTE(review): assumes mods.json records carry exactly 'mod' and 'sub'.
df_mods = df_mods[['mod', 'sub']].merge(
    df_subs, left_on='sub', right_on='name', how='left'
)

In [None]:
# df_mods = df_mods[df_mods['nsubscr'] > 5000]

In [None]:
# Distinct subreddits, then non-null (mod, sub) rows, one value per line.
print(df_mods['sub'].nunique(), df_mods['sub'].count(), sep='\n')

In [None]:
# One row per subreddit, carrying the list of its moderators.
df_nodes = (
    df_mods
    .groupby('sub')['mod']
    .apply(list)
    .reset_index()
)

# Degree filter: currently disabled, so every subreddit is selected. The
# commented-out variant keeps only subreddits with at least two moderators.
selected_subs = df_nodes['sub'].unique()
# selected_subs = df_nodes[df_nodes['mod'].str.len() >= 2]['sub'].unique()
print(selected_subs)


In [None]:
# Boolean mask of the rows whose subreddit is in selected_subs. It is
# deliberately not called `filter`, which would shadow the Python built-in for
# the rest of the notebook session.
sub_is_selected = df_mods['sub'].isin(selected_subs)

# One row per moderator, carrying the list of selected subreddits they moderate.
df_edges = (
    df_mods[sub_is_selected]
    .groupby('mod')['sub']
    .apply(list)
    .reset_index()
)

In [None]:
# Number of moderators with at least one selected subreddit, then a preview.
print(df_edges.shape[0])
df_edges.head()

In [None]:
# For every unordered pair of subreddits, count how many moderators they share.
# Keys are (smaller_name, larger_name) tuples so each pair has a single entry.
edge_weight_dict = defaultdict(int)

# Iterate the 'sub' column directly: only the subreddit lists are needed, and
# this avoids building a Series per row as iterrows() does. A list with a
# single subreddit simply produces no pairs.
for subs in tqdm.tqdm(df_edges['sub'], total=len(df_edges)):
    for i, sub1 in enumerate(subs):
        for sub2 in subs[:i]:
            key = (sub1, sub2) if sub1 < sub2 else (sub2, sub1)
            edge_weight_dict[key] += 1

In [None]:
# Undirected graph: nodes are the selected subreddits (attribute `size` holds
# the subscriber count), edges carry the shared-moderator count as `weight`.
g = nx.Graph()

selected_sub_rows = df_subs[df_subs['name'].isin(selected_subs)]
g.add_nodes_from(
    (row['name'], {'size': row['nsubscr']})
    for _, row in selected_sub_rows.iterrows()
)

g.add_weighted_edges_from(
    (sub1, sub2, weight)
    for (sub1, sub2), weight in edge_weight_dict.items()
)

In [None]:
# Export the graph in GEXF format next to the other output files.
nx.write_gexf(g, f"{OUTPUTFILEPATH}mods.gexf")

In [None]:
# anonymize mod usernames

def anonymize_username(u, salt=""):
    """Return the hex SHA-256 digest of ``salt + u`` (UTF-8 encoded).

    Args:
        u: the username to pseudonymise.
        salt: optional secret string prepended before hashing. With the default
            empty salt the digest is the plain SHA-256 of the username, which
            anyone holding a list of candidate usernames can reproduce; pass a
            private salt when the output must resist that kind of lookup.

    Returns:
        A 64-character lowercase hexadecimal string.
    """
    return hashlib.sha256((salt + u).encode("utf-8")).hexdigest()

db = TinyDB(OUTPUTFILEPATH + 'mods.json')
anon_db = TinyDB(OUTPUTFILEPATH + 'anon_mods.json')

# Pairs already present in the anonymised file, so that re-running this cell
# does not append the same records a second time.
already_written = {(record["mod"], record["sub"]) for record in anon_db}

new_records = []
for d in db:
    # Read the fields by key: unpacking d.values() depends on the stored key
    # order and fails as soon as a record has any additional field.
    anon_pair = (anonymize_username(d["mod"]), d["sub"])
    if anon_pair not in already_written:
        already_written.add(anon_pair)
        new_records.append({"mod": anon_pair[0], "sub": anon_pair[1]})

# Single bulk write instead of one insert per record.
anon_db.insert_multiple(new_records)
