# Assign Politics From Comments


In [1]:
import re
import glob
import bz2
import lzma
import json
from json import JSONDecodeError
from collections import *

In [2]:
# Settings
year_month = '2019-04'
DEM_PATTERN = "(i am|i'm) a (democrat|liberal)|i vote[d]?( for| for a)? (democrat|hillary|biden|obama|blue)|i (hate|despise) (conservatives|republicans|trump|donald trump|mcconell|mitch mcconell)|(i am|i'm) a (former|ex) (conservative|republican)|(i am|i'm) an ex-(conservative|republican)|i (was|used to be|used to vote)( a| as a)? (conservative|republican)|fuck (conservatives|republicans|donald trump|trump|mcconell|mitch mcconell)"
REP_PATTERN = "((i am|i'm) a (conservative|republican)|i vote[d]?( for| for a)? (republican|conservative|trump|romney|mcconell)|i (hate|despise) (liberals|progressives|democrats|left-wing|biden|hillary obama)|(i am|i'm) a (former|ex) (liberal|democrat|progressive)|(i am|i'm) an ex-(liberal|democrat|progressive)|i (was|used to be|used to vote)( a| as a)? (liberal|democrat|progressive)|fuck (liberals|progressives|democrats|biden|hillary|obama))"

In [3]:
files = glob.glob('/shared/2/datasets/reddit-dump-all/RC/*.zst')
files.extend(glob.glob('/shared/2/datasets/reddit-dump-all/RC/*.xz'))
files.extend(glob.glob('/shared/2/datasets/reddit-dump-all/RC/*.bz2'))
files.extend(glob.glob('/shared/2/datasets/reddit-dump-all/RS/*.bz2'))
files.extend(glob.glob('/shared/2/datasets/reddit-dump-all/RS/*.xz'))
files

['/shared/2/datasets/reddit-dump-all/RC/RC_2019-09.zst',
 '/shared/2/datasets/reddit-dump-all/RC/RC_2019-08.zst',
 '/shared/2/datasets/reddit-dump-all/RC/RC_2019-07.zst',
 '/shared/2/datasets/reddit-dump-all/RC/RC_2019-11.zst',
 '/shared/2/datasets/reddit-dump-all/RC/RC_2019-10.zst',
 '/shared/2/datasets/reddit-dump-all/RC/RC_2019-12.zst',
 '/shared/2/datasets/reddit-dump-all/RC/RC_2019-06.zst',
 '/shared/2/datasets/reddit-dump-all/RC/RC_2017-07.xz',
 '/shared/2/datasets/reddit-dump-all/RC/RC_2017-02.xz',
 '/shared/2/datasets/reddit-dump-all/RC/RC_2018-09.xz',
 '/shared/2/datasets/reddit-dump-all/RC/RC_2017-06.xz',
 '/shared/2/datasets/reddit-dump-all/RC/RC_2018-06.xz',
 '/shared/2/datasets/reddit-dump-all/RC/RC_2017-03.xz',
 '/shared/2/datasets/reddit-dump-all/RC/RC_2019-04.xz',
 '/shared/2/datasets/reddit-dump-all/RC/RC_2018-05.xz',
 '/shared/2/datasets/reddit-dump-all/RC/RC_2017-01.xz',
 '/shared/2/datasets/reddit-dump-all/RC/RC_2019-01.xz',
 '/shared/2/datasets/reddit-dump-all/RC/R

In [4]:
def parse_submissions(file_pointer):
    """ Return a users subreddits with their associated politics(s) """
    user_politics = defaultdict(list)
    for count, line in enumerate(file_pointer):
        try:
            submission = json.loads(f.readline().strip())
            username, body, subreddit = submission['author'], submission['body'], submission['subreddit']
            political_party = get_user_political_party(body)
            if political_party:
                user_politics[username].append(political_party)
                
        except (JSONDecodeError, AttributeError) as e:
            print("Failed to parse line: {} with error: {}".format(line, e))

        if count % 1000000 == 0 and count > 0:
            print("Completed %d lines" % (count))

    return user_politics

def get_file_handle(file_path):
    ext = file_path.split('.')[-1]

    if ext == "bz2":
        return bz2.open(file_path)
    elif ext == "xz":
        return lzma.open(file_path)

    raise AssertionError("Invalid extension for " + file_path + ". Expecting bz2 or xz file")
    
def get_user_political_party(text):
    if re.findall(DEM_PATTERN, text.lower()):
        return "Democrat"
    elif re.findall(REP_PATTERN, text.lower()):
        return "Republican"
    return ""


filename = '/shared/2/datasets/reddit-dump-all/RC/RC_2019-04.xz'
f = get_file_handle(filename)
user_politics = parse_submissions(f)

Completed 1000000 lines
Completed 2000000 lines
Completed 3000000 lines
Completed 4000000 lines
Completed 5000000 lines
Completed 6000000 lines
Completed 7000000 lines
Completed 8000000 lines
Completed 9000000 lines
Completed 10000000 lines
Completed 11000000 lines
Completed 12000000 lines
Completed 13000000 lines
Completed 14000000 lines
Completed 15000000 lines
Completed 16000000 lines
Completed 17000000 lines
Completed 18000000 lines
Completed 19000000 lines
Completed 20000000 lines
Completed 21000000 lines
Completed 22000000 lines
Completed 23000000 lines
Completed 24000000 lines
Completed 25000000 lines
Completed 26000000 lines
Completed 27000000 lines
Completed 28000000 lines
Completed 29000000 lines
Completed 30000000 lines
Completed 31000000 lines
Completed 32000000 lines
Completed 33000000 lines
Completed 34000000 lines
Completed 35000000 lines
Completed 36000000 lines
Completed 37000000 lines
Completed 38000000 lines
Completed 39000000 lines
Completed 40000000 lines
Completed

In [5]:
print("Found political affliations for {} users".format(len(user_politics)))

Found political affliations for 4252 users


### Filter out users who self-identify as both democrat and republican

Maybe we want to analyze these later...

In [6]:
filtered_politics = {}
dems, reps = 0,0

for user, politics_list in user_politics.items():
    if (len(set(politics_list)) == 1):
        filtered_politics[user] = politics_list[0]
        if politics_list[0] == "Democrat":
            dems += 1
        else:
            reps += 1
        
print("Number of filtered politics users: {}".format(len(filtered_politics)))
print(dems, reps)

Number of filtered politics users: 4178
2859 1319


## Save the user political affliations

Save them to a separate directory then political flairs for now

In [7]:
out_file = '/shared/0/projects/reddit-political-affiliation/data/comment-affiliations/' + year_month + '.tsv'

with open(out_file, 'w') as f:
    for user, political_party in filtered_politics.items():
        f.write("{}\t{}\n".format(user, political_party))