# Promoting Belonging in r/feminism
J. Nathan Matias, June 2020

Generating dataset for analysis. Pre-analysis plan at [osf.io/xu258/](https://osf.io/xu258/).

In [None]:
%matplotlib inline
import inspect, os, sys, copy, pytz, re, glob, csv, uuid, requests, time
## Placeholder Airbrake credentials written to the process environment
## before the platform modules are imported further down.
## NOTE(review): presumably the platform code reads these at import time --
## confirm, and replace the placeholders with real values if error
## reporting is wanted.
os.environ['AIRBRAKE_API_KEY'] = "1" ## EDIT BEFORE USING
os.environ['AIRBRAKE_PROJECT_ID'] = "1" ## EDIT BEFORE USING

import simplejson as json
import pandas as pd
from dateutil import parser
import datetime
import matplotlib.pyplot as plt   # Matplotlib for plotting
import matplotlib.dates as md
import numpy as np
import seaborn as sns
from collections import Counter, defaultdict
utc=pytz.UTC

## Environment name; selects which JSON config file is read below.
ENV = "production"
os.environ['CS_ENV'] = 'production'
## Root of the platform checkout; appended to sys.path so that the
## `utils` and `app` packages imported later in this cell can be resolved.
BASE_DIR = "/usr/local/civilservant/platform"
sys.path.append(BASE_DIR)

## Read <BASE_DIR>/config/<ENV>.json; the 'host', 'user', 'password' and
## 'database' keys are used to build the database URL.
with open(os.path.join(BASE_DIR, "config") + "/{env}.json".format(env=ENV), "r") as config:
  DBCONFIG = json.loads(config.read())

### LOAD SQLALCHEMY
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy import text, and_, or_
import sqlalchemy.orm.session
import utils.common


## Build the MySQL engine from the loaded config and open one session
## (db_session) that every query in this notebook shares.
## NOTE(review): the credentials are interpolated into the URL verbatim, so
## a password containing characters such as '@' or '/' would presumably
## need URL-escaping -- confirm against the deployed config.
db_engine = create_engine("mysql://{user}:{password}@{host}/{database}".format(
    host = DBCONFIG['host'],
    user = DBCONFIG['user'],
    password = DBCONFIG['password'],
    database = DBCONFIG['database']))
DBSession = sessionmaker(bind=db_engine)
db_session = DBSession()


### LOAD PRAW
import praw
## Module-level Reddit client. No other cell in this notebook reads `r`
## (getPScomments rebinds the name locally for its HTTP response).
r = praw.Reddit(user_agent='research code by /u/natematias')

## Brings the ORM model classes used by the queries below (Experiment,
## ExperimentThing, ExperimentAction, Comment, ModAction) into scope.
from app.models import *

### FILTER OUT DEPRECATION WARNINGS ASSOCIATED WITH DECORATORS
# https://github.com/ipython/ipython/issues/9242
import warnings
## Only DeprecationWarnings whose message matches the pattern are silenced.
warnings.filterwarnings('ignore', category=DeprecationWarning, message='.*use @default decorator instead.*')

In [None]:
class Dict2Obj(object):
    """
    Attribute-style wrapper around a dictionary.

    Every key of the dictionary handed to the constructor becomes an
    attribute on the instance, holding the corresponding value.
    """
    def __init__(self, dictionary):
        """Copy each key/value pair of ``dictionary`` onto the instance."""
        for name, value in dictionary.items():
            setattr(self, name, value)

# Load Experiment

In [None]:
def return_experiment_objects(experiment_name):
    """
    Load the database records that belong to one experiment.

    experiment_name: value matched against Experiment.name.

    Returns a dict with these keys:
      "participants"         - ExperimentThing rows of the experiment
      "usernames"            - the thing_id of every participant row
      "participant_comments" - Comment rows whose user_id is one of those
                               thing_ids, within the experiment's subreddit,
                               ordered by created_utc ascending
      "actions"              - ExperimentAction rows of the experiment
      "subreddit_id"         - taken from the experiment's settings_json

    Raises ValueError when no experiment with that name exists.
    """
    experiment_obj = db_session.query(Experiment).filter(
                   Experiment.name == experiment_name
                 ).first()
    ## .first() yields None when nothing matches; fail with a readable
    ## message instead of an AttributeError on the next line
    if experiment_obj is None:
        raise ValueError(
            "No experiment named {0!r} found in the database".format(experiment_name))

    subreddit_id = json.loads(experiment_obj.settings_json)['subreddit_id']
    participants = db_session.query(ExperimentThing).filter(
                   ExperimentThing.experiment_id == experiment_obj.id
               ).all()
    usernames = [x.thing_id for x in participants]
    participant_comments = db_session.query(Comment).filter(and_(
                                Comment.user_id.in_(usernames),
                                Comment.subreddit_id == subreddit_id
    )).order_by(Comment.created_utc.asc()).all()

    actions = db_session.query(ExperimentAction).filter(
                                ExperimentAction.experiment_id == experiment_obj.id).all()

    return {
        "participants": participants,
        "usernames": usernames,
        "participant_comments": participant_comments,
        "actions": actions,
        "subreddit_id": subreddit_id
    }

### Query Experiment Objects

In [None]:
## the study ran in two rounds, each stored as its own experiment
first_round_name = "newcomer_messaging_experiment-feminism-07.2018"
second_round_name = "newcomer_messaging_experiment-feminism-01.2020"

first_round_results = return_experiment_objects(first_round_name)
second_round_results = return_experiment_objects(second_round_name)

In [None]:
## pool both rounds; only the usernames are de-duplicated
first, second = first_round_results, second_round_results

participants = first['participants'] + second['participants']
usernames = list(set(first['usernames'] + second['usernames']))
participant_comments = first['participant_comments'] + second['participant_comments']
actions = first['actions'] + second['actions']

### Query Modlog

In [None]:
## moderation log entries that target a participant, oldest first;
## the subreddit id is taken from the first round's results
modlog_filter = and_(
    ModAction.target_author.in_(usernames),
    ModAction.subreddit_id == first_round_results['subreddit_id'],
)
modlog_query = db_session.query(ModAction).filter(modlog_filter)
participant_modlog = modlog_query.order_by(ModAction.created_utc.asc()).all()

# Create Dict of Participants 

In [None]:
def participant_object():
    """
    Build an empty participant record.

    Scalar fields start as None, the three collection fields start as
    fresh empty lists, and the ban-day counters start at 0. The key order
    is fixed here and carries through to the exported columns.
    """
    record = dict.fromkeys([
        "username",
        "treatment",
        "comment_id",
        "submission_id",
        "message_status",
        "randomization",
        "block_id",
        "assignment_datetime",
        "comments",
        "ban_actions",
        "comment_actions",
        "comments_2_weeks",
        "comments_4_weeks",
        "comments_8_weeks",
        "ban_days_2_weeks",
        "ban_days_4_weeks",
        "ban_days_8_weeks",
    ])
    ## each record needs its own list objects
    for list_field in ("comments", "ban_actions", "comment_actions"):
        record[list_field] = []
    for weeks in (2, 4, 8):
        record["ban_days_{0}_weeks".format(weeks)] = 0
    return record

In [None]:
## units maps participant.thing_id (the username) to that participant's record
units = defaultdict(participant_object)
for participant in participants:
    metadata = json.loads(participant.metadata_json)
    randomization = metadata['randomization']

    ## look the record up once (created on first access) and fill it in
    unit = units[participant.thing_id]
    unit['block_id']      = randomization["block.id"]
    unit['randomization'] = randomization[""]
    unit['treatment']     = int(randomization['treatment'])
    unit['comment_id']    = metadata["comment_id"]
    unit['submission_id'] = metadata["submission_id"]
    unit['assignment_datetime'] = participant.created_at
    unit['message_status'] = metadata['message_status']

### Add Comments to Dict of Participants

In [None]:
## file every comment under its author's record, oldest comment first
participant_comments = sorted(participant_comments, key = lambda c: c.created_utc)
for comment in participant_comments:
    units[comment.user_id]['comments'].append(comment)

# Load and Merge Survey Results

In [None]:
data_dir = "/home/civilservant/Tresors/CivilServant/projects/CivilServant-reddit/r-feminism-2018" ## UPDATE BEFORE RUNNING

## survey_rows maps the username typed into the survey to that CSV row;
## if a username appears on several rows, the last row read wins
survey_rows = {}

## newline="" is what the csv module asks for when reading from a file
## object, so that line breaks inside quoted fields (the survey has a
## free-text column) are parsed as part of the field
with open(os.path.join(data_dir, "feminism-post-survey-downloaded-05.12.2020.csv"), newline="") as f:
    for row in csv.DictReader(f):
        survey_rows[row['the reddit username you used to comment in r/feminism']] = row

In [None]:
## Maps a survey CSV column header (the question text) to the short field
## name under which the answer is stored on each participant record.
survey_colnames = {
   "Did you identify as a feminist at the time you made your first comment in r/feminism?": "identify_feminist",
   'Select which number, corresponding to the images above, best describes your relationship with r/feminism.': "community_closeness",
   'If you have any comments or thoughts related to the sub, you may share them with us here:': "sub_comments",
   "Timestamp": "timestamp"
}

### How Many Surveys Overlap with Participants in the Study?

In [None]:
print("{0} total surveys".format(len(survey_rows)))

## count survey usernames that are also participant usernames; membership
## is tested against the units dict directly (a hash lookup that does not
## create a record) instead of rebuilding a key list for every survey row
matched_usernames = sum(1 for username in survey_rows if username in units)

print("{0} surveys match to unique participant usernames".format(matched_usernames))

### Merge Surveys with Observational Data

In [None]:
## every participant gets a completed_survey flag plus one field per
## survey column; the survey fields are None when no response matched
for username, unit in units.items():
    survey = survey_rows.get(username)
    completed = survey is not None
    unit['completed_survey'] = completed
    for question, column in survey_colnames.items():
        unit[column] = survey[question] if completed else None

# Count the Number of Days that an account was banned during the observation period


In [None]:
## File each moderation log entry under the participant it targets.
for mod_action in participant_modlog:
    ## resolve the target for EVERY action: the comment-action branch
    ## needs the current action's author, not whichever username was left
    ## over from the previous ban action or from an earlier cell
    username = mod_action.target_author

    ## used to include mute actions, 'muteuser', 'unmuteuser'
    ## but these turned out not to be useful for the study
    ## since they don't prevent someone from posting comments
    ## and were not specified in the pre-analysis plan
    if mod_action.action in ('banuser', 'unbanuser'):
        units[username]['ban_actions'].append(mod_action)
    elif mod_action.action in ('removecomment', 'approvecomment'):
        units[username]['comment_actions'].append(mod_action)

## distribution of the number of ban/unban actions per participant
Counter([len(x['ban_actions']) for x in units.values()])

In [None]:
# For each participant, build a day-by-day ban status over the first
# 8 weeks after assignment, then total the banned days within the first
# 2, 3, ..., 8 weeks.
#
# ban_actions are in chronological order (the modlog query sorts by
# created_utc ascending). Only 'banuser' / 'unbanuser' change the status;
# mutes are not relevant for this study.
#
# NOTE(review): at most one ban action is consumed per observation day, so
# when several actions fall on or before the same day boundary the later
# ones take effect on the following days -- confirm this is intended.

observation_day_count = 8*7

for unit in units.values():
    assigned_at = unit['assignment_datetime']
    pending_actions = unit['ban_actions']
    cursor = 0
    is_banned = False
    daily_status = []

    ## walk the observation days, advancing through the ban actions
    for day_offset in range(observation_day_count):
        day_boundary = assigned_at + datetime.timedelta(days=day_offset)

        if(cursor < len(pending_actions) and
           pending_actions[cursor].created_utc <= day_boundary):
            action_name = pending_actions[cursor].action
            if action_name == 'banuser':
                is_banned = True
            elif action_name == 'unbanuser':
                is_banned = False
            cursor += 1

        daily_status.append(is_banned)

    unit['ban_observation_days'] = daily_status

    ## banned days within the first N weeks, for N = 2..8
    for weeks in range(2, 9):
        unit["ban_days_{0}_weeks".format(weeks)] = sum(daily_status[:weeks*7])

## Identify comments removed in the study and supplement those comments from PushShift

In [None]:
def getPScomments(ids, timeout=30):
    """
    Fetch comment records from the Pushshift comment search endpoint.

    ids: list of comment id strings; they are comma-joined into the
         `ids` query parameter.
    timeout: seconds to wait on the server before requests gives up,
             so a stalled connection cannot hang the notebook.

    Returns the list stored under the "data" key of the JSON response,
    or an empty list when `ids` is empty (no request is made).
    Raises requests.HTTPError when the server answers with an error status.
    """
    ## an empty id list would send an unconstrained search
    if not ids:
        return []
    url = "https://api.pushshift.io/reddit/search/comment/?ids={0}".format(
    ",".join(ids)
    )
    response = requests.get(url, timeout=timeout)
    ## surface HTTP errors directly rather than as a JSON decoding failure
    response.raise_for_status()
    data = json.loads(response.text)
    return data['data']

In [None]:
max_week = 10
observation_window = datetime.timedelta(days = max_week*7)

## fullnames ("t1_" + id) of the comments already loaded from the database
all_comment_ids = ["t1_{0}".format(comment.id) for comment in participant_comments]

## comment moderation actions that fall after a participant's assignment
## and no later than max_week weeks afterwards
all_comment_actions = []
for unit in units.values():
    assigned_at = unit['assignment_datetime']
    all_comment_actions.extend(
        action for action in unit['comment_actions']
        if action.created_utc > assigned_at
        and action.created_utc <= assigned_at + observation_window
    )

print("{0} comment actions to query from PushShift".format(len(all_comment_actions)))

In [None]:
## fullnames of every comment touched by an in-window moderation action
all_modded_ids = list(set([x.target_fullname for x in all_comment_actions]))

## bare comment id (fullname without "t1_") -> author named in the mod action
all_modded_id_usernames = {}
for action in all_comment_actions:
    all_modded_id_usernames[action.target_fullname.replace("t1_", "")] = action.target_author

## only request comments that are not already loaded from the database;
## a set keeps the membership test from rescanning the list for every id
known_comment_ids = set(all_comment_ids)
all_unique_ids = [x for x in all_modded_ids if x not in known_comment_ids]

page_size = 500
courtesy_delay = 0.25

## keyed by the comment's 'id' field as returned by PushShift
retrieved_comments = defaultdict(list)

## one request per page of ids; stepping by page_size visits exactly the
## non-empty pages, so nothing is printed or slept for when there is
## nothing left to fetch
for head in range(0, len(all_unique_ids), page_size):
    sys.stdout.write(".")
    sys.stdout.flush()
    ids = all_unique_ids[head:head + page_size]
    for comment in getPScomments(ids):
        comment['created'] = datetime.datetime.fromtimestamp(comment['created_utc'])
        retrieved_comments[comment['id']] = comment
    time.sleep(courtesy_delay)

In [None]:
## Report how many moderated comments were looked up and how many came back.
## NOTE(review): the first figure is len(all_modded_ids), which also counts
## comments already present in the database; only the ids in all_unique_ids
## were actually requested -- confirm which count is meant here.
print("Attempted to retrieve {0} comments".format(len(all_modded_ids)))
print("{0} comments retrieved successfully (missing comments were likely permanently removed by reddit)".format(len(retrieved_comments)))

## Merge in IDs associated with moderation actions and count the number of comments per week

In [None]:
## wrap each retrieved JSON record in an object so it can sit alongside
## the database comment objects in a participant's comment list
for comment_id, payload in retrieved_comments.items():
    author = all_modded_id_usernames[comment_id]
    ## the epoch timestamp becomes a naive UTC datetime
    payload['created_utc'] = datetime.datetime.utcfromtimestamp(payload['created_utc'])
    units[author]['comments'].append(Dict2Obj(payload))

In [None]:
## comments_N_weeks = number of a participant's comments created before
## assignment + N weeks, for N = 2..10
## NOTE(review): there is no lower bound, so any comment dated before the
## assignment time is counted as well -- confirm that is intended.
for unit in units.values():
    assigned_at = unit['assignment_datetime']
    comment_times = [comment.created_utc for comment in unit['comments']]
    for weeks in range(2, 11):
        cutoff = assigned_at + datetime.timedelta(days=7*weeks)
        unit['comments_{0}_weeks'.format(weeks)] = sum(
            1 for created in comment_times if created < cutoff)

### Record the number of comments removed during the study

In [None]:
## removed_comments_N_weeks = number of a participant's comments whose
## moderation history, within max_week weeks after assignment, ends in a
## removal, and whose first in-window action fell within the first N weeks.
observation_window = datetime.timedelta(days = max_week*7)

for identifier, unit in units.items():
    assignment_datetime = unit['assignment_datetime']

    ## create columns for week periods (weeks 0 and 1 exist but stay at 0)
    for i in range(max_week+1):
        unit["removed_comments_{0}_weeks".format(i)] = 0

    ## group the in-window actions by comment; comment_actions are in
    ## chronological order, so each group is too.
    ## The actions are NOT appended to all_comment_actions here: that list
    ## was already built from these same actions, so appending again only
    ## duplicated its entries.
    comments_removed = defaultdict(list)
    for action in unit['comment_actions']:
        if(action.created_utc > assignment_datetime and
           action.created_utc <= assignment_datetime + observation_window):
            comments_removed[action.target_fullname].append(action)

    ## a comment counts as removed when its LAST action is a removal;
    ## it is dated by its FIRST action
    for actions in comments_removed.values():
        if actions[-1].action == 'removecomment':
            first_action = actions[0]
            comment_week = (first_action.created_utc - assignment_datetime).days // 7 + 1
            ## credit every cumulative window (2 weeks and up) that
            ## contains the comment's week
            for i in range(max(2, comment_week), max_week+1):
                unit['removed_comments_{0}_weeks'.format(i)] += 1

# Create Account Mapping between usernames and unique IDs before outputting data

In [None]:
## one random UUID per participant username; this mapping is the only
## link between the exported ids and the usernames
account_mapping = {
    username: {"username": username, "uuid": uuid.uuid4()}
    for username in units
}

In [None]:
## fields blanked before export; every other field on the record is left
## as it is
## NOTE(review): submission_id and the free-text survey answer are not in
## this list -- confirm they are meant to be exported.
redacted_fields = (
    'ban_observation_days',
    'comments',
    'ban_actions',
    'username',
    'comment_id',
    'comment_actions',
)

for username, unit in units.items():
    ## the UUID string becomes the record's identifier
    unit['id'] = str(account_mapping[username]['uuid'])
    unit.update(dict.fromkeys(redacted_fields))

# Write to Files

In [None]:
## username <-> uuid table, written to a file stamped with today's UTC date
mapping_filename = "r-feminism-account-mapping-{0}.csv".format(
    datetime.datetime.utcnow().strftime('%m.%d.%Y'))
mapping_frame = pd.DataFrame(list(account_mapping.values()))
mapping_frame.to_csv(os.path.join(data_dir, mapping_filename))

In [None]:
## one row per participant record, written to a file stamped with today's UTC date
study_filename = "r-feminism-study-data-merged-{0}.csv".format(
    datetime.datetime.utcnow().strftime('%m.%d.%Y'))
study_frame = pd.DataFrame(list(units.values()))
study_frame.to_csv(os.path.join(data_dir, study_filename))