# Temporal Analysis, manosphere

In [1]:
import gzip
import json
import pickle
import random
from collections import defaultdict

import numpy as np
import pandas as pd

from helpers import jaccard, populate_bin_with_channel, bins_y_o, bins_t_o, bin_to_df

# Directory prefix prepended to the gzipped JSON file names opened below.
SRC = '/dlabdata1/manosphere-rad/'

# aux functions & vars
def helper_df(df, shenanigans=True):
    """Label the rows of ``df`` with ``bins_t_o`` and return its last 11 rows.

    Args:
        df: DataFrame with exactly one row per entry of ``bins_t_o``. It is
            modified in place: an ``idx_str`` column holding ``bins_t_o`` is
            added. When ``shenanigans`` is true it must also provide the
            columns ``x``, ``y`` and ``intersection``.
        shenanigans: When true, three derived columns are added to the
            returned frame:
            ``x+y`` (``x + y - intersection``),
            ``xyz_p`` (``intersection / x+y``) and
            ``intersection/x`` (``intersection / x``).

    Returns:
        An independent copy of the last 11 rows of ``df`` (including
        ``idx_str``), with the derived columns when requested.
    """
    df["idx_str"] = bins_t_o
    # ``tail`` returns a slice of ``df``; copy it so that adding columns below
    # neither raises SettingWithCopyWarning nor depends on view semantics.
    df_years = df.tail(11).copy()
    if shenanigans:
        # Despite its name, "x+y" is x + y minus the overlap
        # (presumably the size of the union -- confirm against bin_to_df).
        df_years["x+y"] = df_years["x"] + df_years["y"] - df_years["intersection"]
        df_years["xyz_p"] = df_years["intersection"] / df_years["x+y"]
        df_years["intersection/x"] = df_years["intersection"] / df_years["x"]
    return df_years

# Categories that are also aggregated under the 'Manosphere' label further down.
cat_man = ['MGTOW', 'Incel', 'MRA', "PUA", 'TRP']
# Every individually labelled category: the ones above plus 'Alt-right'.
cats_all = cat_man + ['Alt-right']
# ``cats`` additionally contains the aggregate 'Manosphere' entry.
# (The original called the non-existent ``list.appenerd`` here and crashed.)
cats = cats_all + ['Manosphere']
# Name of the control counterpart of every entry of ``cats``.
cat_control = ['Control' + cat for cat in cats]

## Common Commenting Users Along the Years

Computation pipeline

In [2]:
# Count, per category and per subreddit, how many distinct authors have at
# least one comment there.
# CREATE_DATA_USERS = True recomputes the counts from the raw author dump and
# overwrites the pickle checkpoint; False just loads the checkpoint.
CREATE_DATA_USERS = False
if not CREATE_DATA_USERS:
    with open(f"../data/unique_user-reddit.pickle", "rb") as f:
        unique_users = pickle.load(f)
    print(unique_users['categories'])
else:
    # NOTE(review): `tqdm` is used below but is not imported in the visible
    # cells -- import it before enabling this branch.
    with gzip.open(SRC+"users_reddit_random.json.gz", 'rb') as f:
        authors_dict = json.load(f)
    unique_users = {'categories': defaultdict(int), 'subreddits': defaultdict(int)}

    for _author, comments in tqdm(authors_dict.items()):
        # Plain dicts serve as insertion-ordered sets, so each author is
        # counted at most once per category / subreddit.
        seen_categories = {}
        seen_subreddits = {}
        for comment in comments:
            category = comment['category']
            seen_subreddits[comment['channel_id']] = True
            seen_categories[category] = True
            if category in cat_man:
                # Authors of any cat_man category also count for the aggregate.
                seen_categories['Manosphere'] = True
        for category in seen_categories:
            unique_users['categories'][category] += 1
        for subreddit in seen_subreddits:
            unique_users['subreddits'][subreddit] += 1

    print(unique_users['categories'])

    with open(f"../data/unique_user-reddit.pickle", "wb") as f:
        pickle.dump(unique_users, f)

In [3]:
# Load the channel dump (channel id -> list of records, as indexed below).
with gzip.open(SRC+"subreddits_random.json.gz", 'rb') as f:
    channel_dict = json.load(f)

# Number of control samples drawn per category.
BOOTSTRAP_ITERATIONS = 100

# Every category gets one set of bins; every control name gets one set of bins
# per bootstrap sample, keyed as '<control name><sample index>'.
bin_owners = list(cats)
bin_owners.extend(
    control + str(sample)
    for control in cat_control
    for sample in range(BOOTSTRAP_ITERATIONS)
)
# Each owner starts with an empty set for every bin of bins_y_o.
bins_dict = {owner: {b: set() for b in bins_y_o} for owner in bin_owners}
    
def create_and_populate_control(category):
    """Populate the bootstrap control bins of ``category``.

    For every bootstrap sample the global ``random`` generator is re-seeded
    with a value derived from the position of ``category`` in ``cats`` and the
    sample index, and the channel ids are shuffled. Channels whose first record
    is labelled 'Control' are then added to
    ``bins_dict['Control' + category + str(sample)]`` until the summed
    unique-user counts of the added channels exceed the unique-user count of
    ``category``.
    """
    for sample in range(BOOTSTRAP_ITERATIONS):
        random.seed(42 + cats.index(category)*BOOTSTRAP_ITERATIONS + sample)
        shuffled_channels = list(channel_dict.keys())
        random.shuffle(shuffled_channels)

        # Unique users still to be matched by control channels.
        remaining = unique_users['categories'][category]
        control_bins = 'Control'+category+str(sample)

        for channel in shuffled_channels:
            channel_users = unique_users['subreddits'][channel]
            # Skip channels that would overshoot the target by more than 1000.
            if remaining - channel_users < -1000:
                continue
            # Only channels labelled 'Control' may enter a control sample.
            if channel_dict[channel][0]['category'] != 'Control':
                continue

            populate_bin_with_channel(channel, channel_dict, bins_dict[control_bins])

            remaining -= channel_users
            if remaining < 0:
                break

# Put every channel into the bins of its own category (and into the
# 'Manosphere' bins when that category is listed in cat_man), then draw the
# control samples for the category.
for cat in cats_all:
    in_manosphere = cat in cat_man
    for channel, comments in channel_dict.items():
        # A channel's category is read from its first record.
        if comments[0]['category'] != cat:
            continue
        populate_bin_with_channel(channel, channel_dict, bins_dict[cat])
        if in_manosphere:
            populate_bin_with_channel(channel, channel_dict, bins_dict['Manosphere'])
    create_and_populate_control(cat)
# The aggregate is not part of cats_all, so its controls are drawn separately.
create_and_populate_control('Manosphere')

In [4]:
# Create shifted bins: for every entry of bins_dict, the content of each bin
# is moved to the next bin in sorted key order. The first bin stays empty and
# the content of the last bin is dropped.
bins_sl_dict = {}

for cat, bin_dict in bins_dict.items():
    bin_sl_dict = {b: set() for b in bins_y_o}
    # Sort once per category instead of twice per inner iteration.
    ordered_bins = sorted(bin_dict)
    for actual, shifted in zip(ordered_bins, ordered_bins[1:]):
        # The set object is shared with bin_dict (no copy is made).
        bin_sl_dict[shifted] = bin_dict[actual]
    bins_sl_dict[cat] = bin_sl_dict

In [5]:
# Compare the bins of every ordered pair of categories with `jaccard`.
# A category paired with itself is compared against its own shifted bins.
df_inter = {}

for cat1 in cats:
    for cat2 in cats:
        mirrored = (cat2, cat1)
        if mirrored not in df_inter:
            other_bins = bins_sl_dict[cat2] if cat1 == cat2 else bins_dict[cat2]
            overlap = bin_to_df(bins_y_o, bins_dict[cat1], other_bins, jaccard)
            df_inter[(cat1, cat2)] = helper_df(overlap)
        else:
            # Reuse the frame already computed for the mirrored pair (same
            # object, not a copy).
            # NOTE(review): helper_df's "intersection/x" column divides by the
            # first category's "x", so it is not symmetric -- confirm that
            # consumers only read symmetric columns from mirrored pairs.
            df_inter[(cat1, cat2)] = df_inter[mirrored]
with open(f"../data/intersections-reddit.pickle", "wb") as f:
    pickle.dump(df_inter, f)

In [6]:
def _mean_control_overlap(label, target_bins, control_cat):
    """Compare ``target_bins`` with every bootstrap control sample of ``control_cat``.

    Prints ``label`` followed by the mean and standard deviation (in percent)
    of the 'jaccard' column per bin label, and returns the 'jaccard' column
    averaged over the BOOTSTRAP_ITERATIONS samples, indexed by 'idx_str'.

    Args:
        label: Heading printed above the per-bin statistics.
        target_bins: Bins compared against each control sample.
        control_cat: Category whose control samples
            (``bins_dict['Control' + control_cat + str(i)]``) are used.
    """
    inters = [
        helper_df(bin_to_df(bins_y_o, target_bins,
                            bins_dict['Control' + control_cat + str(bootstrap_id)],
                            jaccard))
        for bootstrap_id in range(BOOTSTRAP_ITERATIONS)
    ]

    # Collect the values per bin label. This assumes the labels kept by
    # helper_df are exactly bins_t_o[3:]; any other label raises KeyError.
    values = {y: [] for y in bins_t_o[3:]}
    for inter in inters:
        for year, score in zip(inter['idx_str'], inter['jaccard']):
            values[year].append(score)
    print(label)
    for year, pct in values.items():
        print(f' - {year}: {100*np.mean(pct):.2f}% (stdev: {100*np.std(pct):.2f}%)')

    # Element-wise mean over all samples, aligned on the bin label.
    total = inters[0].set_index('idx_str')['jaccard']
    for inter in inters[1:]:
        total = total + inter.set_index('idx_str')['jaccard']
    return total / BOOTSTRAP_ITERATIONS


# Each category compared with its own control samples.
df_control = {}
for cat in cats:
    df_control[cat] = _mean_control_overlap(cat, bins_dict[cat], cat)

# Alt-right compared with the control samples drawn for each category.
for cat in cats:
    df_control["Alt-right"+cat] = _mean_control_overlap(
        "Alt-right+"+cat, bins_dict["Alt-right"], cat)

with open("../data/intersections_control-reddit.pickle", "wb") as f:
    pickle.dump(df_control, f)