# EDA for Controversit: a tool to identify potentially controversial submissions on reddit

This version is for exploration and comes in Jupyter notebook format for ease of use.

## Import useful packages

In [1]:
import os.path
from os import path
import ciso8601
import time
import datetime 
import requests
import json
import csv
import praw
import numpy as np
import pandas as pd
import matplotlib as mpl
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import warnings; warnings.simplefilter('ignore')

In [2]:
# Make sure the output directory for the crawled data exists.
# os.makedirs replaces the `!mkdir` shell escape so that the cell is plain
# Python and does not depend on a `mkdir` command being available.
if path.exists("subreddit_data"):
    print("Directory subreddit_data exists!")
else:
    os.makedirs("subreddit_data", exist_ok=True)
    print("Directory subreddit_data created!")

Directory subreddit_data exists!


# Crawling with PRAW: a python wrapper for the Reddit API

In [None]:
# Reddit API credentials are read from environment variables so that no
# secret is stored in (or committed with) the notebook. Set the following
# variables before starting the kernel:
#   REDDIT_USERNAME, REDDIT_PASSWORD, REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET
# A missing variable raises KeyError here, before any request is made.
# NOTE(review): credentials that were ever hard-coded in this cell must be
# considered leaked and should be revoked/rotated.
uname = os.environ["REDDIT_USERNAME"]
upassword = os.environ["REDDIT_PASSWORD"]
uagent = "controversit by /u/"+uname
cl_id = os.environ["REDDIT_CLIENT_ID"]
cl_secret = os.environ["REDDIT_CLIENT_SECRET"]

# Authenticated PRAW client used by the crawling cells below
reddit = praw.Reddit(client_id=cl_id,
                     client_secret=cl_secret,
                     password=upassword,
                     user_agent=uagent,
                     username=uname)

# Querying the logged-in user confirms that the credentials were accepted
print("Logged in as user: ",reddit.user.me())

## Choose a subreddit and get a list of submissions

In [None]:
def create_pd_dframe(nrows,list_of_fields):
    '''
    Create a placeholder dataframe with a given number of rows
    and a given list of columns.

    Parameters:
        nrows: number of rows of the dataframe.
        list_of_fields: list of column names.

    Returns:
        A float dataframe of shape (nrows, len(list_of_fields)) in which
        every cell is NaN. NaN is used instead of np.empty so that cells
        that are never filled in are recognisable as missing rather than
        holding arbitrary uninitialised values.
    '''

    shape = (nrows,len(list_of_fields))
    d = np.full(shape, np.nan)
    df = pd.DataFrame(data=d,columns=list_of_fields)

    return df

def submission_data_to_dframe(sorting_scheme,n_subm=100):
    '''
    Retrieve up to n_subm reddit submissions together with their
    comments and return them as a pandas dataframe.

    Parameters:
        sorting_scheme: callable accepting a `limit` keyword and returning
            an iterable of submissions (e.g. `subreddit.new`).
        n_subm: maximum number of submissions to request.

    Returns:
        A dataframe with one row per submission actually returned by
        sorting_scheme (possibly fewer than n_subm, possibly none) and the
        columns listed in list_of_fields. The four subm_comment_* columns
        hold one list per submission, with one entry per comment.
    '''

    list_of_fields = ["subm_ID","subm_title","subm_author",
                      "subm_created_utc","subm_upvote_ratio",
                      "subm_link_flair_text",
                      "subm_comment_ids",
                      "subm_comment_authors",
                      "subm_comment_bodies",
                      "subm_comment_scores"]

    # Rows are collected as plain dicts and turned into a dataframe once at
    # the end. This avoids writing through chained indexing
    # (df[col].iloc[i] = ...), which pandas does not guarantee to propagate
    # to the dataframe, and avoids placeholder rows when fewer than n_subm
    # submissions are available.
    records = []
    for sub in sorting_scheme(limit=n_subm):

        # Data on the submission itself
        record = {
            "subm_ID": sub.id,
            "subm_title": sub.title,
            "subm_author": sub.author,
            "subm_created_utc": float(sub.created_utc),
            "subm_upvote_ratio": float(sub.upvote_ratio),
            "subm_link_flair_text": sub.link_flair_text,
        }

        # Data on comments.
        # Per the PRAW documentation, replace_more(limit=0) removes the
        # "load more comments" placeholders without fetching them, so the
        # iteration below only sees real comment objects.
        sub.comments.replace_more(limit=0)
        comments = list(sub.comments)
        record["subm_comment_ids"] = [com.id for com in comments]
        record["subm_comment_authors"] = [com.author for com in comments]
        record["subm_comment_bodies"] = [com.body for com in comments]
        record["subm_comment_scores"] = [float(com.score) for com in comments]

        records.append(record)

    # Passing columns= keeps the column order fixed and yields an empty
    # dataframe with the right columns when nothing was retrieved.
    return pd.DataFrame(records, columns=list_of_fields)

In [None]:
# Subreddit to crawl. Other subreddits used in this study:
#   "amitheasshole", "changemyview"
subreddit_name = "politicaldiscussion"

# Number of submissions to request
n_subm = 1000

# Fetch the newest submissions of the chosen subreddit, with their comments
subreddit_obj = reddit.subreddit(subreddit_name)
subm_data = submission_data_to_dframe(subreddit_obj.new, n_subm=n_subm)

In [None]:
# Preview the first ten crawled submissions
subm_data.head(n=10)

## Dump data into a pickle file

In [None]:
# Pickle the crawled submissions under a name that carries today's date
timestamp = time.time()
dati = datetime.datetime.fromtimestamp(timestamp)
da = dati.date().isoformat()      # calendar date, YYYY-MM-DD
ti = dati.strftime("%H:%M:%S")    # time of day (not part of the file name)

fname = "".join(["dump_r-", subreddit_name, "_", da, ".pkl"])
print(fname)
subm_data.to_pickle(fname)

## Make a few diagnostic plots

In [None]:
# Make a histogram of the controversy indicator,
# i.e. subm_upvote_ratio, for three subreddits

# Load the previously dumped submissions of the subreddits to compare
df0 = pd.read_pickle("dump_r-politicaldiscussion_2019-09-12.pkl")
name0 = "politicaldiscussion"
df1 = pd.read_pickle("dump_r-amitheasshole_2019-09-12.pkl")
name1 = "amitheasshole"
df2 = pd.read_pickle("dump_r-changemyview_2019-09-12.pkl")
name2 = "changemyview"

# Plot styling: large fonts and thick lines
params={'font.size': 20,'axes.labelsize': 20,'legend.fontsize': 18,
        'xtick.labelsize': 20,'ytick.labelsize': 20,'lines.linewidth': 4,'axes.linewidth': 3,
        'xtick.major.width': 3,'ytick.major.width': 3,'xtick.minor.width': 3,'ytick.minor.width': 3,
        'xtick.major.size': 7,'ytick.major.size': 7,'xtick.minor.size': 5,'ytick.minor.size': 5,
        'lines.markeredgewidth' : 3, 'lines.markersize': 6}
mpl.rcParams.update(params)

fig = plt.figure(figsize=(16,8))
plt.xscale("linear")
plt.yscale("log")
plt.axis([-0.05,1.05,0.1,200])
plt.xlabel("Upvotes/(Upvotes+Downvotes)")
plt.xticks(np.arange(0,1.2,0.2),labels=["0.0","0.2","0.4","0.6","0.8","1.0"])
plt.ylabel("Number of Submissions")
plt.yticks([0.1,1,10,100],labels=["0.1","1.0","10","100"])
# Shade the band of upvote ratios treated as controversial and mark its centre
plt.axvspan(0.25, 0.75, facecolor='gray',alpha=0.3,label="Controversial Range")
plt.plot([0.5,0.5],[0,1e8],"k:",linewidth=5,label="Most Controversial")
histdata = [df0["subm_upvote_ratio"].values,df1["subm_upvote_ratio"].values,df2["subm_upvote_ratio"].values]

colors = ["red","lightgreen","lightblue"]
labels = ["r/"+name0,"r/"+name1,"r/"+name2]
plt.hist(histdata,bins=10,range=[0,1],rwidth=0.8,label=labels,color=colors,edgecolor='black',alpha=1)
# 'upper left' is the valid matplotlib location name ('top left' is not);
# together with bbox_to_anchor it places the legend to the right of the axes.
plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1.0))


In [None]:
# Stacked bar chart: for each subreddit, the fraction of submissions with
# a negative (upvote ratio <= 0.25), controversial (in between) and
# positive (upvote ratio >= 0.75) reception

# Load the previously dumped submissions of the subreddits to compare
df0 = pd.read_pickle("dump_r-politicaldiscussion_2019-09-12.pkl")
name0 = "politicaldiscussion"
df1 = pd.read_pickle("dump_r-amitheasshole_2019-09-12.pkl")
name1 = "amitheasshole"
df2 = pd.read_pickle("dump_r-changemyview_2019-09-12.pkl")
name2 = "changemyview"

# Plot styling: large fonts and thick lines
params={'font.size': 20,'axes.labelsize': 20,'legend.fontsize': 18,
        'xtick.labelsize': 20,'ytick.labelsize': 20,'lines.linewidth': 4,'axes.linewidth': 3,
        'xtick.major.width': 3,'ytick.major.width': 3,'xtick.minor.width': 3,'ytick.minor.width': 3,
        'xtick.major.size': 7,'ytick.major.size': 7,'xtick.minor.size': 5,'ytick.minor.size': 5,
        'lines.markeredgewidth' : 3, 'lines.markersize': 6}
mpl.rcParams.update(params)

# Every fraction is computed from the subreddit's own upvote ratios. The
# earlier version of this cell selected the r/changemyview rows with the
# r/politicaldiscussion mask, which gave wrong fractions for r/changemyview.
ratios = [df0["subm_upvote_ratio"],df1["subm_upvote_ratio"],df2["subm_upvote_ratio"]]

# (mask).sum() counts the rows that satisfy the condition
negative_opinion = np.array([float((r <= 0.25).sum())/float(len(r)) for r in ratios])
positive_opinion = np.array([float((r >= 0.75).sum())/float(len(r)) for r in ratios])

# Whatever is neither negative nor positive counts as controversial
controversial_opinion = np.array([1.0,1.0,1.0])
controversial_opinion = controversial_opinion-positive_opinion-negative_opinion

plt.figure(figsize=(16,8))
ind = [0,1,2]
# Each segment starts where the previous one ends (left=...)
p1 = plt.barh(ind,negative_opinion,height=0.5,color="red",label="Negative")
p2 = plt.barh(ind,controversial_opinion,height=0.5,color="orange",left=negative_opinion,label="Controversial")
p3 = plt.barh(ind,positive_opinion,color="green",height=0.5,\
     left=negative_opinion+controversial_opinion, label="Positive")
plt.xlabel("Fraction of Submissions")
plt.yticks(ind,["r/"+name0,"r/"+name1,"r/"+name2])
plt.legend(bbox_to_anchor=(1.05, 1.0), loc=2, borderaxespad=0.)

## A few additional tests with PRAW: run a search query

In [None]:
# Try out the search endpoint of the API on one subreddit.
# Other subreddits used in this study:
#   "politicaldiscussion", "amitheasshole"
subreddit_name = "changemyview"

n_subm = 1000

subreddit_obj = reddit.subreddit(subreddit_name)

# Ask for the ten newest hits of the query "all" and list their titles
search_results = subreddit_obj.search("all", sort='new', syntax='lucene',
                                      time_filter='all', limit=10)
i = 0
for sub in search_results:
    print(i,sub.title)
    i += 1

# Crawling with Pushshift.io

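Pushshift.io is a more versatile alternative to PRAW for this project: the functions below use it to query all submissions of a subreddit within an arbitrary time range.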

## A series of functions to use the API

In [3]:
def getPushshiftData(after, before, limit, subr):
    '''
    Query submissions from a subreddit within the time
    range [after,before] using a Pushshift.io call.

    Parameters:
        after: start of the time range (sent as the `after` parameter).
        before: end of the time range (sent as the `before` parameter).
        limit: maximum number of submissions requested (`size` parameter).
        subr: name of the subreddit.

    Returns:
        The content of the "data" field of the JSON response.

    Raises:
        requests.HTTPError if the server answers with an error status;
        requests.Timeout if no answer arrives within the timeout.
    '''
    url = "https://api.pushshift.io/reddit/search/submission/"
    # Let requests build and URL-encode the query string
    params = {"size": limit, "after": after, "before": before, "subreddit": subr}
    # The timeout keeps a stalled connection from blocking the crawl forever
    r = requests.get(url, params=params, timeout=60)
    # Fail with a clear HTTP error instead of a confusing JSON/KeyError
    r.raise_for_status()
    return r.json()["data"]

In [4]:
def getExtra(sub_id,useragent):
    '''
    Given a submission sub_id, get extra data
    that is not provided by Pushshift.io, i.e.
    score and upvote ratio, from reddit's JSON view
    of the submission.

    Parameters:
        sub_id: id of the submission.
        useragent: value sent in the User-agent header.

    Returns:
        The tuple (score, upvote_ratio).

    Raises:
        requests.HTTPError on an error status, requests.Timeout if no
        answer arrives in time, KeyError/IndexError if the response does
        not have the expected structure.
    '''
    # https avoids sending the request in clear text
    searchURL = 'https://reddit.com/'
    url = searchURL + str(sub_id) + '.json'
    r = requests.get(url, headers = {'User-agent': useragent}, timeout=60)
    r.raise_for_status()
    # The submission is the first child of the first listing in the response
    post = r.json()[0]["data"]["children"][0]["data"]
    return post["score"],post["upvote_ratio"]

In [5]:
def collectSubData(subDic,subm,subr,useragent):
    '''
    Append the data of one submission to the lists stored in subDic.

    Parameters:
        subDic: dictionary of lists, one list per field; modified in place.
        subm: dictionary describing one submission.
        subr: name of the subreddit the submission belongs to.
        useragent: user agent passed on to getExtra.

    Returns:
        subDic, with exactly one new entry appended to each of its lists.
    '''
    subDic["subreddit"].append(subr)
    subDic["sub_id"].append(subm["id"])
    subDic["title"].append(subm["title"])
    subDic["author"].append(subm["author"])
    # NOTE: fromtimestamp converts to the local time zone of this machine
    dt = str(datetime.datetime.fromtimestamp(subm["created_utc"]))
    subDic["created"].append(dt)
    subDic["url"].append(subm["url"])
    subDic["permalink"].append(subm["permalink"])
    # Optional fields: the string "NaN" marks a missing value
    subDic["flair"].append(subm.get("link_flair_text","NaN"))
    subDic["numComms"].append(subm["num_comments"])
    subDic["selftext"].append(subm.get("selftext","NaN"))
    try:
        score,upvote_ratio = getExtra(subm["id"],useragent)
    except Exception:
        # Best effort: if the extra lookup fails for any reason, fall back
        # to the score contained in subm and mark the upvote ratio as
        # missing. Exception (rather than a bare except) lets
        # KeyboardInterrupt and SystemExit stop the crawl.
        # NOTE(review): the marker here is spelled "Nan", unlike the "NaN"
        # used above; it is kept unchanged for compatibility with data that
        # was already saved.
        upvote_ratio = "Nan"
        score = subm["score"]
    subDic["upvote_ratio"].append(upvote_ratio)
    subDic["score"].append(score)

    return subDic

In [6]:
def runQuery(subCount, subDic,after, before, limit, subr, useragent):
    '''
    Page through the Pushshift.io results for one subreddit and
    return all collected submissions as a pandas dataframe.

    Pages are requested with getPushshiftData until an empty page
    comes back; every submission of a page is added to subDic with
    collectSubData. After each page the lower time bound moves to
    the created_utc of the last submission of that page.

    subCount is incremented once per submission; it is only used
    inside this function.
    '''
    page = getPushshiftData(after, before, limit, subr)
    while page:
        for entry in page:
            subDic = collectSubData(subDic, entry, subr, useragent)
            subCount += 1
        # Continue from the last submission that was seen
        after = page[-1]["created_utc"]
        page = getPushshiftData(after, before, limit, subr)

    return pd.DataFrame(data=subDic)

In [7]:
def saveDFrame(df,subr):
    '''
    Save a dataframe to ./subreddit_data/r-<subr>-export.pkl
    in pickle format.

    Parameters:
        df: pandas dataframe to save.
        subr: name of the subreddit, used to build the file name.

    An existing file with the same name is overwritten.
    '''
    outpath = "./subreddit_data/"
    # Create the output directory on demand so that saving does not fail
    # when the directory has not been created beforehand
    os.makedirs(outpath, exist_ok=True)
    fname = outpath+"r-"+subr+"-export.pkl"
    df.to_pickle(fname)

In [8]:
def MPQueryWrapper(after,before,limit,subr,useragent):
    '''
    Run one complete query and save its result: this is the
    function handed to the multiprocessing pool, so that several
    queries can run in parallel.

    The query parameters are printed, the submissions are collected
    with runQuery, the resulting dataframe is saved with saveDFrame
    and then returned.
    '''
    # Columns of the resulting dataframe
    column_names = ("subreddit","sub_id","title","author",
                    "created","url","permalink","score",
                    "numComms","flair","selftext","upvote_ratio")
    # One empty list per column, to be filled by the query
    subDic = {name: [] for name in column_names}

    print(after,before,limit,subr,useragent)
    df = runQuery(0,subDic,after,before,limit,subr,useragent)
    saveDFrame(df,subr)

    return df

## Choose list of subreddits, a time range and a list of query parameters

In [10]:
# User agent sent with the requests to reddit
useragent = "DavidWithAnE"

# Subreddits to crawl, one query per subreddit
subr_list = ["comsci","politicaldiscussion",
             "quotes","history"]

# Max number of submissions per query
limit = 10000

# Time range of the queries, given as ISO dates.
# A range starting at "2019-01-01" was the fiducial choice for
# advice, changemyview, confession, parenting.
dateini = "2010-01-01"
dateend = "2020-01-01"
tsi = ciso8601.parse_datetime(dateini)
tsf = ciso8601.parse_datetime(dateend)
# Convert both ends of the range to timestamps in seconds
# (time.mktime interprets the time tuple as local time)
after = int(time.mktime(tsi.timetuple()))
before = int(time.mktime(tsf.timetuple()))

# One argument tuple per subreddit, in the order expected by MPQueryWrapper
query_params = []
for i, subr in enumerate(subr_list):
    query_params.append((after,before,limit,subr,useragent))

## Call API and run queries

In [None]:
# Run the queries on multiple cores, one query per subreddit
from itertools import starmap
import multiprocessing as mp

# No more workers than there are queries
ncores = min(mp.cpu_count(),len(subr_list))

# The context manager shuts the worker processes down when the block is
# left, including when a query raises or the cell is interrupted, so that
# no worker processes are left behind.
with mp.Pool(ncores) as worker_pool:
    # starmap returns one dataframe per entry of query_params, in order
    out = list(worker_pool.starmap(MPQueryWrapper, query_params))

display(out[0])

1262332800 1577865600 10000 comsci DavidWithAnE
1262332800 1577865600 10000 politicaldiscussion DavidWithAnE
1262332800 1577865600 10000 quotes DavidWithAnE
1262332800 1577865600 10000 history DavidWithAnE


## Alternative setup for very large subreddits: split time range in chunks

In [35]:
# User agent sent with the requests to reddit
useragent = "DavidWithAnE"

# Subreddit to crawl
subrname = "quotes"

# Max number of submissions per query
limit = 10000

# Time range of the crawl, given as ISO dates
dateini = "2019-01-01"
dateend = "2020-09-30"
tsi = ciso8601.parse_datetime(dateini)
tsf = ciso8601.parse_datetime(dateend)
# From here on dateini/dateend hold the timestamps in seconds
dateini = int(time.mktime(tsi.timetuple()))
dateend = int(time.mktime(tsf.timetuple()))

# Split the range into nchunks consecutive intervals of dt seconds each.
# The last interval ends exactly at dateend, so that the seconds lost by
# rounding dt down are still covered.
nchunks = 8
dt = int((dateend-dateini)/nchunks)
edges = [dateini + k*dt for k in range(nchunks)] + [dateend]

# One argument tuple per interval, in the order expected by MPQueryWrapper
query_params = []
for i in range(nchunks):
    after, before = edges[i], edges[i+1]
    query_params.append((after,before,limit,subrname,useragent))

## Run API query on the time chunks

In [36]:
# Run the queries on multiple cores, one query per time chunk
from itertools import starmap
import multiprocessing as mp

# No more workers than there are chunks
ncores = min(mp.cpu_count(),nchunks)

# The context manager shuts the worker processes down when the block is
# left, including when a query raises or the cell is interrupted.
with mp.Pool(ncores) as worker_pool:
    # One dataframe per chunk, in the order of query_params
    chunk_dfs = worker_pool.starmap(MPQueryWrapper, query_params)

# Every worker saves its own chunk to the same file name for this
# subreddit, so on disk only one chunk would survive. Combine the chunks
# and save the complete result over that file.
subr_data_all = pd.concat(chunk_dfs, ignore_index=True)
saveDFrame(subr_data_all, subrname)

1546329600 1553219550 10000 quotes DavidWithAnE
1553219550 1560109500 10000 quotes DavidWithAnE
1560109500 1566999450 10000 quotes DavidWithAnE
1566999450 1573889400 10000 quotes DavidWithAnE
1573889400 1580779350 10000 quotes DavidWithAnE
1587669300 1594559250 10000 quotes DavidWithAnE
1580779350 1587669300 10000 quotes DavidWithAnE
1594559250 1601449200 10000 quotes DavidWithAnE


Process ForkPoolWorker-71:
Process ForkPoolWorker-68:
Process ForkPoolWorker-69:
Process ForkPoolWorker-67:
Process ForkPoolWorker-66:
Process ForkPoolWorker-70:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/dmartizzi/anaconda2/envs/base-py3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/Users/dmartizzi/anaconda2/envs/base-py3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/Users/dmartizzi/anaconda2/envs/base-py3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/Users/dmartizzi/anaconda2/envs/base-py3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/dmartizzi/anaconda2/envs/base-py3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/dmartizzi/anacond

KeyboardInterrupt: 

Traceback (most recent call last):
  File "/Users/dmartizzi/anaconda2/envs/base-py3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/Users/dmartizzi/anaconda2/envs/base-py3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/dmartizzi/anaconda2/envs/base-py3/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
  File "/Users/dmartizzi/anaconda2/envs/base-py3/lib/python3.7/multiprocessing/queues.py", line 352, in get
    res = self._reader.recv_bytes()
  File "/Users/dmartizzi/anaconda2/envs/base-py3/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes
    buf = self._recv_bytes(maxlength)
  File "/Users/dmartizzi/anaconda2/envs/base-py3/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes
    buf = self._recv(4)
  File "/Users/dmartizzi/anaconda2/envs/base-py3/lib/python3.7/multiprocessing/connection.py", line 379, in _

# Setup a new SQL database for the data 

These are prototypes and need to be coded properly.

In [None]:
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2

In [None]:
# Name of the database that will hold the subreddit data
dbname = 'subreddit-db'
# Postgres user name: taken from the PGUSER environment variable when it is
# set, so that the notebook does not have to be edited per user; the
# previous hard-coded name remains the default.
username = os.environ.get('PGUSER', 'dmartizzi')

In [None]:
# Connect to the local postgres server.
# The URL scheme must be "postgresql": SQLAlchemy 1.4 and later no longer
# accept the old "postgres" alias.
engine = create_engine('postgresql://%s@localhost/%s'%(username,dbname))
print(engine.url)

In [None]:
# Create the database on first use; an existing database is left untouched
db_url = engine.url
if not database_exists(db_url):
    create_database(db_url)
# Report whether the database is available now
print(database_exists(db_url))

In [None]:
# Copy the exported submissions of each subreddit into its own SQL table
subr_list_to_sql = ["politicaldiscussion", "history","changemyview","quotes"]
for sss in subr_list_to_sql:
    # Read the export from the directory that saveDFrame writes to
    fname = "./subreddit_data/r-"+sss+"-export.pkl"
    subr_data = pd.read_pickle(fname)
    # The table is named after the subreddit; an existing table is replaced
    subr_data.to_sql(sss, engine, if_exists='replace')