In [1]:
import json
import gzip
import bz2
import lzma
import glob
from os.path import basename
from  collections import *
from tqdm.notebook import tqdm
import ast
import networkx as nx
import random
import pickle
from collections import Counter

# Construct the weighted bipartite graph from user to subreddit

### Filter Out Small Subreddits

In [4]:
# Get total submissions by subreddit
files = glob.glob('/shared/0/projects/reddit-political-affiliation/data/bipartite-networks/2018*.tsv')
sub_frequencies = Counter()

for fname in tqdm(files, desc='Processing all files'):
    with open(fname, 'rt') as f:
        lines = f.readlines()

    for line in tqdm(lines, position=1, desc='Build graph from file'):
        user, subreddit, freq = line[:-1].split('\t')
        subreddit = 'r/' + subreddit
        sub_frequencies[subreddit] += int(freq)
    
    
# Filter to subreddits with more than 500
min_sub_threshold = 500
filtered_subreddits = {sub: count for sub, count in sub_frequencies.items() if count >= min_sub_threshold}
print("# of subreddits: " + str(len(sub_frequencies)))
print("# of subreddits with 500+ submissions: " + str(len(filtered_subreddits)))

with open('/shared/0/projects/reddit-political-affiliation/data/2018_filtered_subs.tav', 'w') as f:
    for s, c in filtered_subreddits.items():
        f.write('%s\t%d\n' % (s, c))

# of subreddits: 1668578
# of subreddits with 500+ submissions: 35927


### Build the graph

In [8]:
files = glob.glob('/shared/0/projects/reddit-political-affiliation/data/bipartite-networks/2018*.tsv')

g = nx.Graph()
users = set()
subreddits = set()

for fname in tqdm(files, desc='Processing all files'):
    with open(fname, 'rt') as f:
        lines = f.readlines()

    for line in tqdm(lines, position=1, desc='Build graph from file'):
        user, subreddit, freq = line[:-1].split('\t')
        freq = int(freq)
        
        # distinguish users from subreddits
        subreddit = 'r/' + subreddit

        if subreddit in filtered_subreddits:
            users.add(user)
            subreddits.add(subreddit)

            if not g.has_node(user):
                g.add_node(user)
            if not g.has_node(subreddit):
                g.add_node(subreddit)

            if g.has_edge(user, subreddit):
                g[user][subreddit]['weight'] += freq
            else:
                g.add_edge(user, subreddit, weight=freq)

all_nodes = users | subreddits


def filter_single_edge_nodes(g):
    out_degree = g.degree()
    to_remove = [n for (n, deg) in g.degree() if deg == 1]
    g.remove_nodes_from(to_remove)
    return g


g = filter_single_edge_nodes(g)

### Store the graph for later use

In [11]:
# Save the graph as an object instead of rebuilding it
out_dir = '/shared/0/projects/reddit-political-affiliation/data/bipartite-networks/'

with open(out_dir + '2018_graph_filtered.pickle', 'wb') as handle:
    pickle.dump(g, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [10]:
# Load in previously generated graph
input_file = '/shared/0/projects/reddit-political-affiliation/data/bipartite-networks/2018_graph_filtered.pickle'

with open(input_file, 'rb') as handle:
    g = pickle.load(handle)

# Load in the users flair-based labels

In [12]:
files = glob.glob('/shared/0/projects/reddit-political-affiliation/data/flair-affiliations/20*.tsv')

user_to_politic_counts = defaultdict(Counter)

for fname in tqdm(files):
    with open(fname, 'rt') as f:
        for line in f:
            user, politics, freq = line.split('\t')
            user_to_politic_counts[user][politics] += int(freq)
            
print("User to politic counts: " + len(user_to_politic_counts))
print(list(user_to_politic_counts.items())[:10])

HBox(children=(FloatProgress(value=0.0, max=164.0), HTML(value='')))




TypeError: can only concatenate str (not "int") to str

# Canonicalize each user to a single affiliation; toss out those with more than one

In [13]:
user_to_politics = {}
for u, pc in user_to_politic_counts.items():
    if len(pc) > 1:
        continue
    user_to_politics[u] = list(pc.keys())[0]
print('Saw political affiliations for %d users' % len(user_to_politics))

Saw political affiliations for 7775 users


# Split in to test/train sets

In [14]:
all_identified_users = list(user_to_politics.keys())
random.seed(42)
random.shuffle(all_identified_users)

train_users = all_identified_users[:int(0.9*len(all_identified_users))]
test_users = all_identified_users[int(0.9*len(all_identified_users)):]
print(len(train_users), len(test_users))

6997 778


# Train personalized page rank for both parties

In [15]:
rep_users = set([k for k in train_users if user_to_politics[k] == 'Republican'])
dem_users = set([k for k in train_users if user_to_politics[k] == 'Democrat'])

print(len(rep_users), len(dem_users))

5018 1979


In [8]:
n = g.number_of_nodes()
rep_personalization = {}

for n in tqdm(g.nodes, total=n):
    if n in rep_users:
        rep_personalization[n] = 1
    else:
        rep_personalization[n] = 0.00001

HBox(children=(FloatProgress(value=0.0, max=19827115.0), HTML(value='')))




In [8]:
n = g.number_of_nodes()
dem_personalization = {}

for n in tqdm(g.nodes, total=n):
    if n in dem_users:
        dem_personalization[n] = 1
    else:
        dem_personalization[n] = 0.00001

HBox(children=(FloatProgress(value=0.0, max=19827115.0), HTML(value='')))




In [None]:
rep_ppr = nx.pagerank(g, personalization=rep_personalization)
out_dir = '/shared/0/projects/reddit-political-affiliation/data/flair-affiliations/'

with open(out_dir + 'rep_ppr_2018.pickle', 'wb') as handle:
    pickle.dump(rep_ppr, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [10]:
dem_ppr = nx.pagerank(g, personalization=dem_personalization)
out_dir = '/shared/0/projects/reddit-political-affiliation/data/flair-affiliations/'

with open(out_dir + 'dem_ppr_2018.pickle', 'wb') as handle:
    pickle.dump(dem_ppr, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
import itertools
import pickle

sorted_dem_ppr = {k: v for k, v in sorted(dem_ppr.items(), key=lambda item: item[1], reverse=True)}
sorted_rep_ppr = {k: v for k, v in sorted(rep_ppr.items(), key=lambda item: item[1], reverse=True)}

In [None]:
top_dem = dict(itertools.islice(sorted_dem_ppr.items(), 25))
top_rep = dict(itertools.islice(sorted_rep_ppr.items(), 25))

print("Top 25 DEM PPR")
print(top_dem)
print("Top 25 REP PPR")
print(top_rep)

# Save the PPR's to a file so I don't have to recompute later
out_dir = '/shared/0/projects/reddit-political-affiliation/data/flair-affiliations/'
with open(out_dir + 'rep_ppr_2015.pickle', 'wb') as handle:
    pickle.dump(sorted_rep_ppr, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open(out_dir + 'dem_ppr_2015.pickle', 'wb') as handle:
    pickle.dump(sorted_dem_ppr, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [16]:
# Load the PPRs back in

out_dir = '/shared/0/projects/reddit-political-affiliation/data/flair-affiliations/'
with open(out_dir + 'dem_ppr_2018.pickle', 'rb') as handle:
    dem_ppr = pickle.load(handle)
    
print(out_dir)
with open(out_dir + 'rep_ppr_2018.pickle', 'rb') as handle:
    rep_ppr = pickle.load(handle)

/shared/0/projects/reddit-political-affiliation/data/flair-affiliations/


# Null Model

1. Randomly select the same number of users
2. Compute PPR
3. Repeat ~30 times to get an estimate of the mean/standard-deviation
4. Calculate the z-score using the actual PPR and the estimated mean/stddev

Note - If there are k republican users, just pick k random users, e.g. random.sample(all_users, k) and use those as the “seed” users in the PPR

# Score Distributions

### Read in the null models

In [17]:
# Read in all of the null models and store the results
score_dist = defaultdict(list)
directory = '/shared/0/projects/reddit-political-affiliation/data/ppr-scores/2018*.tsv'

files = glob.glob(directory)

for fname in tqdm(files, desc='Processing all files'):
    with open(fname, 'rt') as f:
        lines = f.readlines()

    for line in tqdm(lines, position=1, desc='Reading user scores in from file'):
        user, score = line[:-1].split('\t')
        score_dist[user].append(score)

HBox(children=(FloatProgress(value=0.0, description='Processing all files', max=25.0, style=ProgressStyle(desc…

HBox(children=(FloatProgress(value=0.0, description='Reading user scores in from file', max=19827115.0, style=…




HBox(children=(FloatProgress(value=0.0, description='Reading user scores in from file', max=19827115.0, style=…




HBox(children=(FloatProgress(value=0.0, description='Reading user scores in from file', max=19827115.0, style=…




HBox(children=(FloatProgress(value=0.0, description='Reading user scores in from file', max=19827115.0, style=…




HBox(children=(FloatProgress(value=0.0, description='Reading user scores in from file', max=19827115.0, style=…




HBox(children=(FloatProgress(value=0.0, description='Reading user scores in from file', max=19827115.0, style=…




HBox(children=(FloatProgress(value=0.0, description='Reading user scores in from file', max=19827115.0, style=…




HBox(children=(FloatProgress(value=0.0, description='Reading user scores in from file', max=19827115.0, style=…




HBox(children=(FloatProgress(value=0.0, description='Reading user scores in from file', max=19827115.0, style=…




HBox(children=(FloatProgress(value=0.0, description='Reading user scores in from file', max=19827115.0, style=…




HBox(children=(FloatProgress(value=0.0, description='Reading user scores in from file', max=19827115.0, style=…




HBox(children=(FloatProgress(value=0.0, description='Reading user scores in from file', max=19827115.0, style=…




HBox(children=(FloatProgress(value=0.0, description='Reading user scores in from file', max=19827115.0, style=…




HBox(children=(FloatProgress(value=0.0, description='Reading user scores in from file', max=19827115.0, style=…




HBox(children=(FloatProgress(value=0.0, description='Reading user scores in from file', max=19827115.0, style=…




HBox(children=(FloatProgress(value=0.0, description='Reading user scores in from file', max=19827115.0, style=…




HBox(children=(FloatProgress(value=0.0, description='Reading user scores in from file', max=19827115.0, style=…




HBox(children=(FloatProgress(value=0.0, description='Reading user scores in from file', max=19827115.0, style=…




HBox(children=(FloatProgress(value=0.0, description='Reading user scores in from file', max=19827115.0, style=…




HBox(children=(FloatProgress(value=0.0, description='Reading user scores in from file', max=19827115.0, style=…




HBox(children=(FloatProgress(value=0.0, description='Reading user scores in from file', max=19827115.0, style=…




HBox(children=(FloatProgress(value=0.0, description='Reading user scores in from file', max=19827115.0, style=…




HBox(children=(FloatProgress(value=0.0, description='Reading user scores in from file', max=19827115.0, style=…




HBox(children=(FloatProgress(value=0.0, description='Reading user scores in from file', max=19827115.0, style=…




HBox(children=(FloatProgress(value=0.0, description='Reading user scores in from file', max=19827115.0, style=…





## Top Political Subreddits

In [27]:
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

# Find the top PPR score (avg) and plot a distribution as a sanity check
print("Computing mean and variance for null models")
score_avg_dist = {}
score_var = {}
smoothing = 0.00000
for user, scores in tqdm(score_dist.items(), desc='Processing null models'):
    scores = np.array(scores, dtype=float)
    score_avg_dist[user] = np.mean(scores)
    score_var[user] = np.var(scores) + smoothing

In [28]:
norm_dem_ppr, norm_rep_ppr = {}, {}
sub_rep_ppr = {k: v for k, v in rep_ppr.items() if k[:2] == 'r/' and k[2:4] != 'u_'}
sub_dem_ppr = {k: v for k, v in dem_ppr.items() if k[:2] == 'r/' and k[2:4] != 'u_'}

for sub, score in tqdm(sub_dem_ppr.items(), desc='Processing democratic PPR scores'):
    if score_var[sub] == 0:
        continue
    norm_dem_ppr[sub] = (score / score_avg_dist[sub]) / score_var[sub]
    
for sub, score in tqdm(sub_rep_ppr.items(), desc='Processing republican PPR scores'):
    if score_var[sub] == 0:
        continue
    norm_rep_ppr[sub] = (score / score_avg_dist[sub]) / score_var[sub]
    
    
print("Sorting lists to the top political subreddits")
sub_dem_ppr = {k: v for k, v in sorted(norm_dem_ppr.items(), key=lambda item: item[1], reverse=True)}
sub_rep_ppr = {k: v for k, v in sorted(norm_rep_ppr.items(), key=lambda item: item[1], reverse=True)}

top_dem = dict(itertools.islice(sub_dem_ppr.items(), 100))
top_rep = dict(itertools.islice(sub_rep_ppr.items(), 100))

print("Top DEM Subreddits")
print(top_dem)

print("Top REP Subreddits")
print(top_rep)

Top DEM Scores (Normalized)
{'r/RoyHodgson': 2.069480975300759e+41, 'r/Filter_Porn': 2.069480975300759e+41, 'r/LibtardsofReddit': 2.069480975300759e+41, 'r/UnderscoreTest': 2.069480975300759e+41, 'r/Ethernext': 2.069480975300759e+41, 'r/WriteTheElderly': 2.069480975300759e+41, 'r/BeePuncherAnon': 2.069480975300759e+41, 'r/HonestRating': 2.069480975300759e+41, 'r/wiizerdofwiierd': 2.069480975300759e+41, 'r/GamersAscent': 2.069480975300759e+41, 'r/TravisMartinTV': 2.069480975300759e+41, 'r/MainStreetMedia': 2.069480975300759e+41, 'r/Stoner_Girls': 2.069480975300759e+41, 'r/VALAR_M0RGHUL1S': 2.069480975300759e+41, 'r/ZistanceTestSub': 2.069480975300759e+41, 'r/Bloodwolf': 2.069480975300759e+41, 'r/OhLongJohnson': 2.069480975300759e+41, 'r/mgo3teams': 2.069480975300759e+41, 'r/DogsWithWheels': 2.069480975300759e+41, 'r/HiScoreGirl': 2.069480975300759e+41, 'r/6Respect': 2.069480975300759e+41, 'r/notfibular': 2.069480975300759e+41, 'r/shittynewsroom': 2.069480975300759e+41, 'r/workmanship': 

# Politicalness Score

Sum the rep_ppr and dem_ppr for a user/subreddit and then normalize across all users and (separately) all subreddits to get a score of "politicalness". E.g., subreddits that have a higher score end up being visited more frequently by politically-active users

Correct bias using null model

In [11]:
# Separate user's and subreddits for this analysis
users_rep_ppr = {k: v for k, v in rep_ppr.items() if k[:2] != 'r/'}
users_dem_ppr = {k: v for k, v in dem_ppr.items() if k[:2] != 'r/'}

sub_rep_ppr = {k: v for k, v in rep_ppr.items() if k[:2] == 'r/'}
sub_dem_ppr = {k: v for k, v in dem_ppr.items() if k[:2] == 'r/'}

users_total_ppr = sum(users_rep_ppr.values()) + sum(users_dem_ppr.values())
subs_total_ppr = sum(sub_rep_ppr.values()) + sum(sub_dem_ppr.values())

print(users_total_ppr)
print(subs_total_ppr)

0.3459368045732271
1.6540631955111813


# Predicting on the test set

Test whether the "political leaning" score predicts the affiliations for the users in the `test_users` set by seeing which affiliation (from the PPR score) is larger.

In [None]:
from sklearn.metrics import accuracy_score

def get_prediction(user):
    if user not in rep_ppr:
        return "Republican"
    score = rep_ppr[user] - dem_ppr[user]
    if score > 0:
        return "Republican"
    else:
        return "Democrat"
    
actual, predictions = [], []

for user in test_users:
    actual.append(user_to_politics[user])
    predictions.append(get_prediction(user))
    
print(accuracy_score(actual, predictions))