# r/ffxiv Experiment Dataframe Creation
March 2020

J. Nathan Matias and Eric Pennington

### Libraries and environment

In [26]:
%matplotlib inline
import inspect, os, sys, copy, pytz, re, glob, csv, uuid, datetime
# Airbrake settings are written into the process environment here, ahead of
# the platform imports (utils.common, app.models) further down this cell.
# NOTE(review): the key below is hardcoded in the notebook; presumably a
# placeholder, but confirm it is not a live credential before sharing.
os.environ['AIRBRAKE_API_KEY'] = "ca826dbd1a4594241c239bba825edd9f" ## EDIT BEFORE USING
os.environ['AIRBRAKE_PROJECT_ID'] = "-1" ## EDIT BEFORE USING

import simplejson as json
import pandas as pd
from dateutil import parser
import datetime
import matplotlib.pyplot as plt   # Matplotlib for plotting
import matplotlib.dates as md
from collections import Counter, defaultdict
# UTC timezone object, used later to make parsed timestamps timezone-aware.
utc = pytz.UTC

# Environment name: selects the config file and is exported as CS_ENV.
ENV = "production"
os.environ['CS_ENV'] = 'production'

# Put the platform checkout on the import path so its packages can be imported.
BASE_DIR = "/usr/local/civilservant/platform"
sys.path.append(BASE_DIR)

# Settings are read from <BASE_DIR>/config/<ENV>.json.
config_path = os.path.join(BASE_DIR, "config") + "/{env}.json".format(env=ENV)
with open(config_path, "r") as config:
    DBCONFIG = json.load(config)

### LOAD SQLALCHEMY
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy import text, and_, or_
import sqlalchemy.orm.session
import utils.common


# Pick out the four settings the connection URL needs; a missing key raises
# KeyError here, before any connection is attempted.
connection_fields = {
    field: DBCONFIG[field]
    for field in ("host", "user", "password", "database")
}
db_engine = create_engine(
    "mysql://{user}:{password}@{host}/{database}".format(**connection_fields))

# One session factory bound to the engine, and one session created from it.
DBSession = sessionmaker(bind=db_engine)
db_session = DBSession()


### LOAD PRAW
#import reddit.connection
#conn = reddit.connection.Connect(base_dir=BASE_DIR, env="jupyter")

from app.models import *

### FILTER OUT DEPRECATION WARNINGS ASSOCIATED WITH DECORATORS
# https://github.com/ipython/ipython/issues/9242
# Only DeprecationWarnings whose message matches the "use @default decorator
# instead" pattern are silenced; every other warning is still shown.
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning, message='.*use @default decorator instead.*')

# Global Experiment Settings

In [23]:
# Subreddit identifier used to select this subreddit's rows from the
# mod_actions table (presumably the reddit id of r/ffxiv -- confirm).
subreddit_id = "2rgs7"

# Create a Dataset of Posts Appearing In the Experiment

In [15]:
experiment_posts = []

# Experiment things with object_type 1 joined to their rows in posts, limited
# to experiments 15 and 16 and ordered by posts.created ascending.
experiment_posts_query = text("""
select * from experiment_things 
  JOIN posts on experiment_things.id = posts.id 
  WHERE object_type=1 
    AND (experiment_id=15 OR experiment_id=16)
  ORDER BY posts.created ASC;

""")

for row in db_engine.execute(experiment_posts_query):
    # Copy every column of the result row into a plain dict.
    post = dict((column, row[column]) for column in row.keys())

    # Lay the entries of the stored randomization record directly on the post.
    metadata = json.loads(row['metadata_json'])
    post.update(metadata['randomization'])

    # After the merge the post has an entry under the empty-string key;
    # keep it as an integer under 'treat.number' and drop the unnamed entry.
    post['treat.number'] = int(post[''])
    del post['']
    del post['metadata_json']

    # post_data is stored as JSON text; replace it with the decoded object.
    post['post_data'] = json.loads(post['post_data'])
    experiment_posts.append(post)

In [34]:
# Report how many posts were loaded, then the 'created' values of the first
# and last entries (the list was fetched in ascending posts.created order).
assigned_post_count = len(experiment_posts)
print("{0} posts have been assigned to treatment or control in the experiment.".format(assigned_post_count))

first_post, last_post = experiment_posts[0], experiment_posts[-1]
earliest_date = first_post['created']
latest_date = last_post['created']

print("Earliest Date: {0}".format(earliest_date))
print("Latest Date: {0}".format(latest_date))

12100 posts have been assigned to treatment or control in the experiment.
Earliest Date: 2019-07-06 12:26:30
Latest Date: 2019-08-24 19:06:05


### Load Moderator Actions Between the Earliest Date and One Week After the Final Post

In [28]:
# Load every moderator action recorded for the study subreddit from the
# earliest experiment post up to seven days after the latest one.
#
# The values are passed as bound parameters instead of being formatted into
# the SQL text, so quoting and datetime rendering are handled by the driver.
mod_actions_query = text("""
SELECT action_data FROM mod_actions
    WHERE subreddit_id = :subreddit_id AND
          created_utc >= :window_start AND
          created_utc <= :window_end
    ORDER BY created_utc;
""")
mod_action_window = {
    "subreddit_id": subreddit_id,
    "window_start": earliest_date,
    "window_end": latest_date + datetime.timedelta(days=7),
}

recent_mod_actions = []
for row in db_engine.execute(mod_actions_query, mod_action_window):
    mod_action = json.loads(row['action_data'])
    # 'created_utc' inside the action JSON is a numeric timestamp; store a
    # timezone-aware UTC datetime next to it under 'created'.
    mod_action['created'] = utc.localize(datetime.datetime.utcfromtimestamp(mod_action['created_utc']))
    recent_mod_actions.append(mod_action)
print("{0} moderator actions loaded".format(len(recent_mod_actions)))

49497 moderator actions loaded


### Tag posts as visible or non-visible based on moderation log
This cell also creates `study_posts`, a dict keyed by post id that is used to create the final dataframe.

In [33]:
# Index the experiment posts by id, marking each one visible to begin with.
study_posts = {}
for post in experiment_posts:
    post['visible'] = True
    study_posts[post['id']] = post
recent_post_count = len(study_posts)
print("Post Count: {0}".format(recent_post_count))

# Visibility a post is left with after each moderator action that is tracked;
# all other action types are ignored.
visibility_after_action = {"removelink": False, "approvelink": True}

missing_mod_actions = []
matched_mod_actions = 0
# Actions were loaded ORDER BY created_utc, so replaying them in list order
# leaves each post with the state set by its most recent action.
for action in recent_mod_actions:
    if action['action'] not in visibility_after_action:
        continue
    key = action['target_fullname'].replace("t3_","")
    if key not in study_posts:
        # The action targets a post that is not part of the experiment.
        missing_mod_actions.append(key)
        continue
    study_posts[key]['visible'] = visibility_after_action[action['action']]
    matched_mod_actions += 1
#print("Missing Mod Actions: {0}".format(len(missing_mod_actions)))
# print("Missing Mod Action Posts: {0}".format(len(set(missing_mod_actions))))
print("Matched Mod Actions: {0}".format(matched_mod_actions))

Post Count: 12100
Matched Mod Actions: 4160


# Create a Dataset of Comments In the Experiment
## Load last six months of comments from CivilServant and Baumgartner systems
Six months before 2019-07-06 12:26:30