In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from datetime import timedelta
import pandas as pd
import numpy as np
import copy
import os.path
import sys
import warnings
warnings.filterwarnings('ignore')


# Local Modules
sys.path.insert(0, os.path.abspath('/data/manoel/platform_bans/'))
from helpers.regression_helpers import get_slice_date_venue, set_intervention_stuff
from helpers.regression_helpers import get_content_helper
from helpers.vars import interventions, grace_period, exclude_dates
from helpers.match_helpers import get_matched_dataframes

# Extra /r/Incels entry: a two-element list whose elements are both
# 2017-12-03 (presumably a [start, end] pair covering that single day --
# confirm against the structure defined in helpers.vars).
extra_incels_dates = [pd.to_datetime("2017-12-03"), pd.to_datetime("2017-12-03")]

# Deep-copy first so the `exclude_dates` object imported from helpers.vars is
# not modified; only the private copy receives the extra entry.
exclude_dates2 = copy.deepcopy(exclude_dates)
exclude_dates2["/r/Incels"].append(extra_incels_dates)

# Load data and compute basic statistics

In [3]:
# Load the merged post table and summarise each venue (unique authors,
# submissions, comments) inside the window around its intervention date.
DATA_PATH = "/data/manoel/platform_bans/data/"
df = pd.read_feather(DATA_PATH + "processed_merged.f")

# The_Donald pair: rows dated from 120 days before to 119 days after the
# "Measure" date, both bounds inclusive.
td_measure = interventions['/r/The_Donald']["Measure"]
tdcons = df.venue.isin(["/r/The_Donald", "thedonald.win"]) & \
    df.date_post.between(td_measure - timedelta(days=120),
                         td_measure + timedelta(days=119))

# Incels pair: same window, anchored on the /r/Incels "Measure" date.
in_measure = interventions['/r/Incels']["Measure"]
incons = df.venue.isin(["/r/Incels", "incels.co"]) & \
    df.date_post.between(in_measure - timedelta(days=120),
                         in_measure + timedelta(days=119))

tostats = df.loc[incons | tdcons]

# One row per venue. After the aggregation the frame is transposed, its row
# labels are replaced by positions 0..2 (author count, submission count,
# comment count, in that order), those positions are given readable names,
# and the frame is transposed back.
tostatsgb = (
    tostats
    .groupby("venue")
    .agg({
        "author": pd.Series.nunique,
        'type': [
            lambda kinds: np.sum(kinds == "submission"),
            lambda kinds: np.sum(kinds == "comment"),
        ],
    })
    .T
    .reset_index(drop=True)
    .rename({0: "Unique Authors", 1: "Submissions", 2: "Comments"})
    .T
)

# The filtered copy is only needed for the summary; release it.
del tostats
tostatsgb

Unnamed: 0_level_0,Unique Authors,Submissions,Comments
venue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
/r/Incels,18088,17403,340650
/r/The_Donald,80002,251090,2703615
incels.co,2270,25139,385765
thedonald.win,38510,280156,2390641


# Community level

In [4]:
# (Activity)

# Relabel each fringe site with the name of its Reddit counterpart so both
# halves of a migration share one venue label. This overwrites `df.venue`
# in place.
for fringe_name, reddit_name in (("incels.co", "/r/Incels"),
                                 ("thedonald.win", "/r/The_Donald")):
    df.loc[df["venue"] == fringe_name, "venue"] = reddit_name

# Daily per-venue aggregates over rows whose body is not missing:
# "id" -> number of rows, "author" -> distinct authors,
# "length" -> NaN-ignoring mean.
df_activity = (
    df.loc[df["body"].notna()]
    .groupby(["venue", pd.Grouper(key='date_post', freq='d')])
    .agg({"id": len, "author": pd.Series.nunique, 'length': np.nanmean})
    .reset_index()
)

# Keep only rows selected by the date/venue slice helper for each community.
# The helper receives 120 and 119 (presumably days before / days after the
# intervention, matching the days_before/days_after values used elsewhere in
# this notebook -- confirm against helpers.regression_helpers).
td_window = get_slice_date_venue(
    df_activity, "/r/The_Donald", interventions["/r/The_Donald"]["Measure"], 120, 119)
incels_window = get_slice_date_venue(
    df_activity, "/r/Incels", interventions["/r/Incels"]["Measure"], 120, 119)
df_activity = df_activity.loc[td_window | incels_window]

# Number of NEW authors per day: take each (author, venue) pair's earliest
# post date, then count how many pairs fall on each day per venue.
df_first = df.groupby(["author", "venue"])["date_post"].min().reset_index()
df_first_grouped = (
    df_first
    .groupby(["venue", pd.Grouper(key='date_post', freq='d')])
    .count()
    .rename(columns={"author": "first"})
    .reset_index()
)

# NOTE(review): the join is an inner join, so a (venue, day) row of
# df_activity with no first-time author on that day is dropped from df_all.
df_all = df_activity.merge(df_first_grouped, on=["venue", "date_post"], how="inner")

# Posts per distinct author for each (venue, day).
df_all["idpauthor"] = df_all["id"].div(df_all["author"])

# Intervention columns: the flag starts at 0 and the helper is then called
# once per venue on df_all (it is used for its in-place effect; its return
# value is ignored). Presumably it also creates `date_idx` -- verify.
df_all["intervention_flag"] = 0
for venue_name in ("/r/Incels", "/r/The_Donald"):
    set_intervention_stuff(df_all, venue_name, interventions[venue_name]["Measure"])

# Convert each date_idx value to its whole-day count via `.days`.
df_all["date_idx"] = df_all["date_idx"].apply(lambda delta: delta.days)

# saves data
df_all.to_csv("./data/reproducibility_data/activity_agg.csv", index=False)

# (Content)

# Row mask: a post qualifies when its score on a community's fixation
# dictionary is above zero AND its venue label is that same community
# (the fringe labels were already rewritten to the Reddit names above).
fixation_dict = (
    (df["fixation_dict_incels"].gt(0) & df["venue"].eq("/r/Incels"))
    | (df["fixation_dict_td"].gt(0) & df["venue"].eq("/r/The_Donald"))
)

# Same helper and window arguments twice: once on every row, once on the
# fixation-dictionary subset only.
df_content = get_content_helper(df, days_before=120, days_after=119)
df_norm = get_content_helper(df.loc[fixation_dict], days_before=120, days_after=119)

# Persist both aggregates without the index column.
df_content.to_csv("./data/reproducibility_data/content_agg.csv", index=False)
df_norm.to_csv("./data/reproducibility_data/content_agg_fixation_dict.csv", index=False)

# Prepare user-level dataframes

In [5]:
# Prepares matched dataframes!
# Re-read the table from disk: the copy previously held in `df` had its
# fringe venue labels overwritten with the Reddit names in an earlier cell,
# whereas the calls below pass the fringe labels explicitly.
df = pd.read_feather(DATA_PATH + "processed_merged.f")


def _match_users(posts, reddit_venue, fringe_venue):
    """Call get_matched_dataframes for one Reddit/fringe venue pair.

    The migration date and grace period are looked up under `reddit_venue`
    in `interventions` and `grace_period`; the window is fixed at
    days_before=120 and days_after=119.

    Parameters
    ----------
    posts : DataFrame passed through as `df_`.
    reddit_venue : Reddit venue label, also used as the lookup key.
    fringe_venue : label of the corresponding fringe site.

    Returns
    -------
    Whatever get_matched_dataframes returns (unpacked into four values by
    the callers below).
    """
    return get_matched_dataframes(
        df_=posts,
        reddit_venue=reddit_venue,
        fringe_venue=fringe_venue,
        migration_date=interventions[reddit_venue]["Measure"],
        grace_period=grace_period[reddit_venue],
        days_before=120,
        days_after=119,
    )


# All posts, one call per community.
pairs_td, df_td_users_matched, df_gb_td, df_before_after_td = _match_users(
    df, "/r/The_Donald", "thedonald.win")

pairs_in, df_in_users_matched, df_gb_in, df_before_after_in = _match_users(
    df, "/r/Incels", "incels.co")

# Same two calls restricted to rows whose score on the community's fixation
# dictionary is above zero (results carry the `_f` suffix).
pairs_td_f, df_td_users_matched_f, df_gb_td_f, df_before_after_td_f = _match_users(
    df.loc[df["fixation_dict_td"] > 0], "/r/The_Donald", "thedonald.win")

pairs_in_f, df_in_users_matched_f, df_gb_in_f, df_before_after_in_f = _match_users(
    df.loc[df["fixation_dict_incels"] > 0], "/r/Incels", "incels.co")

quartiles before [  1.   7.  27. 101.]
quartiles after [ 1  2  6 31]
quartiles before [  1.  19. 116. 398.]
quartiles after [ 1  2  9 64]
quartiles before [ 1.  2.  7. 20.]
quartiles after [1 1 3 9]
quartiles before [  1.  12.  42. 149.]
quartiles after [ 1  2  6 34]


In [6]:
# Content aggregates restricted to the matched users of both communities.

# Fringe venue label -> Reddit venue label; any other label is kept as is.
d = {"incels.co": "/r/Incels", "thedonald.win": "/r/The_Donald"}


def _matched_users_relabelled():
    """Return a new frame stacking the matched-user rows of both communities.

    Concatenates `df_td_users_matched` and `df_in_users_matched` and rewrites
    the `venue` column through the mapping `d`, leaving labels that are not
    keys of `d` unchanged.
    """
    stacked = pd.concat([df_td_users_matched, df_in_users_matched])
    stacked["venue"] = stacked.venue.apply(lambda label: d.get(label, label))
    return stacked


# Gets mean values per day per venue (all matched-user rows)
xum = _matched_users_relabelled()
df_content = get_content_helper(xum, days_before=120, days_after=119)

# Gets mean values per day per venue (fixation-dictionary rows only).
# The frame is rebuilt rather than reused, as in the original cell.
# NOTE(review): whether get_content_helper modifies its input is not visible
# here; if it does not, the rebuild could be dropped.
xum = _matched_users_relabelled()
fixation_dict = ((xum.fixation_dict_incels > 0) & (xum.venue == "/r/Incels") |
                (xum.fixation_dict_td > 0) & (xum.venue == "/r/The_Donald"))
df_norm = get_content_helper(xum[fixation_dict], days_before=120, days_after=119)

df_content.to_csv("./data/reproducibility_data/content_matched_agg.csv", index=False)
df_norm.to_csv("./data/reproducibility_data/content_matched_agg_fixation_dict.csv", index=False)

# The user_matched_td.csv / user_matched_incels.csv exports that used to be
# repeated at the end of this cell are performed once, by the export cell
# that follows, from the same unchanged frames.

In [7]:
# Export the user-level tables to CSV. Every export follows the same recipe:
# drop some columns, rename some columns, write without the index.

# Columns removed from the before/after (matched) tables.
to_drop = ['SEVERE_TOXICITY_x', 'kind_x', 'SEVERE_TOXICITY_y', 'kind_y', 'ptile_after', 'group_after']
# Column renames for the before/after tables and for the per-user tables.
matched_names = {"before": "num_posts_x", "after": "num_posts_y"}
per_user_names = {"body": "num_posts"}

# (table, columns to drop, column renames, destination), in write order.
exports = (
    (df_before_after_td, to_drop, matched_names,
     "./data/reproducibility_data/user_matched_td.csv"),
    (df_before_after_in, to_drop, matched_names,
     "./data/reproducibility_data/user_matched_incels.csv"),
    (df_gb_td, ['author'], per_user_names,
     "./data/reproducibility_data/user_td.csv"),
    (df_gb_in, ['author'], per_user_names,
     "./data/reproducibility_data/user_incels.csv"),
    (df_before_after_td_f, to_drop, matched_names,
     "./data/reproducibility_data/user_matched_td_f.csv"),
    (df_before_after_in_f, to_drop, matched_names,
     "./data/reproducibility_data/user_matched_incels_f.csv"),
)

for table, unwanted, new_names, csv_path in exports:
    # drop/rename return new frames, so the source tables stay unchanged.
    table.drop(columns=unwanted).rename(columns=new_names).to_csv(csv_path, index=False)

---