# User Trajectory Analyses

In [1]:
import gzip
import json
import pickle
import random

import pandas as pd
from tqdm import tqdm

from helpers import bins_y_s, add_user_categories_to_bin, mean_confidence_interval, normalize_user_bins, \
                    find_users_constraint, find_users_other_bin, estimate_for_users

In [2]:
# Root directory of the input data used by this notebook.
SRC = '/dlabdata1/manosphere-rad/'

# Table of sources; its "Id" column is used later to keep only comments
# made on channels that appear in this table.
_sources_csv = SRC + "sources_final_trimmed.csv"
df_sources = pd.read_csv(_sources_csv)

In [3]:
def to_control(i):
    """Collapse a control-group category into the single label "Control".

    Args:
        i: a comment record (mapping) with a "category" entry.

    Returns:
        The same object ``i``. Its "category" is overwritten in place with
        "Control" when the current value is listed in ``cats_to_control``;
        otherwise the record is returned untouched.
    """
    if i["category"] not in cats_to_control:
        return i
    i["category"] = "Control"
    return i

# Categories that to_control() relabels as "Control".
cats_to_control = ["center", "left-center", "right-center", "left", "right"]

# Every category accepted when filtering comments: the control ones first,
# followed by the remaining five.
cats_all = [*cats_to_control, "PUA", "Alt-right", "MGTOW", "MRA", "Incel"]

# Category labels after relabelling: the non-control categories in their
# original order, then the single "Control" label.
cats = [*(name for name in cats_all if name not in cats_to_control), "Control"]

In [4]:
# Create or use already computed checkpoint?
#   True  -> rebuild the per-bin user data from the raw comment dump and
#            overwrite the checkpoint on disk.
#   False -> load the checkpoint written by a previous run.
CREATE_BIN_USERS = True

if CREATE_BIN_USERS:
    # Read authors: a mapping from author key to that author's list of
    # comment records.
    with gzip.open(SRC+"users_youtube.json.gz", 'rb') as f:
        authors_dict = json.load(f)

    # Create bins: one (initially empty) dict per entry of bins_y_s.
    bin_users = {b:dict() for b in bins_y_s}

    # Minimum number of relevant comments an author must have to be kept.
    min_num_users = 1

    # Build the channel-id lookup once; testing membership against the
    # raw `.values` array would rescan the whole column for every comment.
    known_channel_ids = set(df_sources.Id.values)

    print(len(authors_dict))
    for key, item in tqdm(authors_dict.items()):
        # Keep comments in a known category on a known channel, relabelling
        # control categories as "Control" (to_control edits the record in place).
        relevant_items = [to_control(i) for i in item
                          if i["category"] in cats_all
                          and i["channel_id"] in known_channel_ids]

        if len(relevant_items) < min_num_users:
            continue
        for comment in relevant_items:
            add_user_categories_to_bin(bin_users, key, comment["category"], comment["timestamp"])

    normalize_user_bins(bin_users)

    # Checkpoint
    with open("../data/bin_users.pickle", "wb") as f:
        pickle.dump(bin_users, f)
else:
    # The original condition here tested an undefined name (CREATE_DF), so
    # turning CREATE_BIN_USERS off raised NameError instead of loading.
    # NOTE: only unpickle checkpoints written by this notebook; unpickling
    # an untrusted file can execute arbitrary code.
    with open("../data/bin_users.pickle", "rb") as f:
        bin_users = pickle.load(f)

## Final graph data

In [5]:
# Categories that every cohort below may mix with its own core categories.
_ALSO_ALLOWED = ["Intellectual Dark Web", "Alt-lite"]


def _total(x, keys):
    """Add up x[k] for the given keys, left to right."""
    return sum(x[k] for k in keys)


def _cohort(core):
    """Build a predicate over a user's per-category values.

    The predicate holds when the `core` categories sum to something non-zero
    and `core` plus the also-allowed categories sum to exactly 1. The second
    sum is only evaluated when the first check passes.
    """
    core_and_allowed = core + _ALSO_ALLOWED
    return lambda x: _total(x, core) != 0 and _total(x, core_and_allowed) == 1


# (display name, predicate) pairs; the two parallel lists below are derived
# from this single table so that names and predicates cannot drift apart.
_named_constraints = [
    ("MGTOW", _cohort(["MGTOW"])),
    ("MRA", _cohort(["MRA"])),
    ("Incel", _cohort(["Incel"])),
    ("PUA", _cohort(["PUA"])),
    ("Manosphere", _cohort(["MGTOW", "Incel", "MRA", "PUA"])),
    ("Control", _cohort(["Control"])),
    ("Alt-lite or I.D.W.", lambda x: _total(x, _ALSO_ALLOWED) == 1),
    ("Alt-lite", lambda x: x["Alt-lite"] == 1),
    ("I.D.W.", lambda x: x["Intellectual Dark Web"] == 1),
]

constraints_names = [name for name, _pred in _named_constraints]
constraints = [pred for _name, pred in _named_constraints]

# One row per (constraint, start bin, observed bin); turned into a DataFrame later.
estimates = []
# Bounds on x["Alt-right"] * x["count"]: a user counts as exposed when the
# product is >= p[0] and < p[1].
p = (1,9999)

# NOTE(review): the constraints read x["Intellectual Dark Web"] and x["Alt-lite"],
# but neither label is in cats_all, which filters what goes into the bins in this
# notebook. Confirm the helpers provide those keys; this loop does not check.


def _watched_alt_right(x):
    """Return whether x["Alt-right"] * x["count"] lies in [p[0], p[1])."""
    # presumably the product is the user's Alt-right comment count in the bin
    # (normalised share times total) -- TODO confirm against normalize_user_bins.
    return p[1] > x["Alt-right"] * x["count"] >= p[0]


def _exposure_estimate(xs):
    """Summarise the per-user exposure flags with mean_confidence_interval."""
    return mean_confidence_interval([_watched_alt_right(x) for x in xs])


# Number of exposed users in each later bin. This depends only on the bin, not
# on the constraint or the start bin, so it is computed once up front instead
# of inside the innermost loop (assumes find_users_constraint is a pure lookup).
num_exposed_by_bin = {
    bin_key: len(find_users_constraint(bin_users, bin_key, _watched_alt_right))
    for bin_key in bins_y_s[1:]
}

for lamb, cname in zip(constraints, constraints_names):
    # Users already assigned to a cohort for this constraint; each user is
    # tracked only from the first start bin in which they satisfy it.
    already_tracked = set()

    # The first four bins serve as cohort start points.
    for start in range(4):
        start_bin = bins_y_s[start]

        cohort = set(find_users_constraint(bin_users, start_bin, lamb)) - already_tracked
        already_tracked |= cohort
        non_radical = list(cohort)

        # Row for the start bin itself: by construction everyone is tracked
        # and the "infected" figures are fixed at zero.
        tmp = estimate_for_users(bin_users, start_bin, non_radical, _exposure_estimate)

        tmp["idxo"] = start
        tmp["idx"] = start_bin
        tmp["start"] = start
        tmp["p"] = p
        tmp["numUsersStart"] = len(non_radical)
        tmp["numUsersTracked"] = len(non_radical)
        tmp["pUsersTracked"] = 1
        tmp["constraint"] = cname
        tmp["numUsersInfected"] = 0
        tmp["pUsersInfected"] = 0

        estimates.append(tmp)

        # Follow the cohort through every later bin.
        for offset, bin_key in enumerate(bins_y_s[start+1:], start=1):
            tracked_users = find_users_other_bin(bin_users, bin_key, non_radical)
            num_exposed = num_exposed_by_bin[bin_key]

            tmp = estimate_for_users(bin_users, bin_key, tracked_users, _exposure_estimate)

            tmp["idx"] = bin_key
            tmp["idxo"] = start + offset
            tmp["p"] = p
            tmp["start"] = start
            tmp["numUsersStart"] = len(non_radical)
            tmp["numUsersTracked"] = len(tracked_users)
            tmp["pUsersTracked"] = len(tracked_users)/len(non_radical) if len(non_radical) != 0 else 0
            tmp["numUsersInfected"] = len(tracked_users) * tmp["mean"]
            # Guard the division like pUsersTracked above: a bin in which
            # nobody meets the exposure threshold previously raised
            # ZeroDivisionError here.
            tmp["pUsersInfected"] = tmp["numUsersInfected"]/num_exposed if num_exposed != 0 else 0

            tmp["constraint"] = cname

            estimates.append(tmp)

# Gather every collected estimate row into a single table.
df = pd.DataFrame(estimates)

# Persist the table so other notebooks/scripts can load it.
with open("../helper_files/global_user_df.pickle", "wb") as out_file:
    pickle.dump(df, out_file)