In [1]:
import json
import gzip
import bz2
import lzma
import glob
from os.path import basename
from  collections import *
from tqdm.notebook import tqdm
import ast
import networkx as nx
import random

# Construct the weighted bipartite graph from user to subreddit

In [None]:
files = glob.glob('/shared/0/projects/reddit-political-affiliation/data/bipartite-networks/2015*.tsv')

# Undirected user <-> subreddit graph. Subreddit nodes carry an 'r/' prefix and
# each edge's 'weight' accumulates the frequencies read for that pair.
g = nx.Graph()

users = set()
subreddits = set()

for fname in tqdm(files, desc='Processing all files'):
    # Read the whole file up front so the inner progress bar knows its total.
    with open(fname, 'rt') as f:
        lines = f.readlines()

    for line in tqdm(lines, position=1, desc='Build graph from file'):
        # Strip only the line terminator. Slicing with [:-1] would chop the
        # last digit of the frequency on a final line with no trailing newline.
        user, subreddit, freq = line.rstrip('\n').split('\t')
        freq = int(freq)

        # distinguish users from subreddits
        subreddit = 'r/' + subreddit

        users.add(user)
        subreddits.add(subreddit)

        if g.has_edge(user, subreddit):
            # Pair already seen (e.g. in an earlier file): sum the frequencies.
            g[user][subreddit]['weight'] += freq
        else:
            # add_edge creates any missing endpoint, so no explicit add_node
            # calls are needed beforehand.
            g.add_edge(user, subreddit, weight=freq)

all_nodes = users | subreddits

HBox(children=(FloatProgress(value=0.0, description='Processing all files', max=12.0, style=ProgressStyle(desc…

HBox(children=(FloatProgress(value=0.0, description='Build graph from file', max=10439277.0, style=ProgressSty…




HBox(children=(FloatProgress(value=0.0, description='Build graph from file', max=11515693.0, style=ProgressSty…




HBox(children=(FloatProgress(value=0.0, description='Build graph from file', max=11313519.0, style=ProgressSty…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Build graph from file', max=1.0, style=…




HBox(children=(FloatProgress(value=0.0, description='Build graph from file', max=11104185.0, style=ProgressSty…




HBox(children=(FloatProgress(value=0.0, description='Build graph from file', max=11388946.0, style=ProgressSty…




HBox(children=(FloatProgress(value=0.0, description='Build graph from file', max=12196146.0, style=ProgressSty…




HBox(children=(FloatProgress(value=0.0, description='Build graph from file', max=11325896.0, style=ProgressSty…




HBox(children=(FloatProgress(value=0.0, description='Build graph from file', max=12122977.0, style=ProgressSty…




HBox(children=(FloatProgress(value=0.0, description='Build graph from file', max=12038587.0, style=ProgressSty…




HBox(children=(FloatProgress(value=0.0, description='Build graph from file', max=11163185.0, style=ProgressSty…




HBox(children=(FloatProgress(value=0.0, description='Build graph from file', max=12112936.0, style=ProgressSty…

# Load in the users' flair-based labels

In [None]:
files = glob.glob('/shared/0/projects/reddit-political-affiliation/data/flair-affiliations/20*.tsv')

# Maps each user to a Counter of affiliation label -> summed frequency
user_to_politic_counts = defaultdict(Counter)

for path in tqdm(files):
    with open(path, 'rt') as handle:
        # Rows are three tab-separated fields; the last one still carries the
        # newline, which int() accepts.
        for user, politics, freq in (row.split('\t') for row in handle):
            user_to_politic_counts[user][politics] += int(freq)
print(len(user_to_politic_counts))

In [None]:
# Peek at the first ten (user, Counter) pairs as a sanity check
[pair for pair, _ in zip(user_to_politic_counts.items(), range(10))]

# Canonicalize each user to a single affiliation; toss out those with more than one

In [None]:
# Keep a user only when their counts name no more than one affiliation;
# anyone with several distinct labels is dropped.
user_to_politics = {
    user: list(counts.keys())[0]
    for user, counts in user_to_politic_counts.items()
    if len(counts) <= 1
}
print('Saw political affiliations for %d users' % len(user_to_politics))

# Split into test/train sets

In [None]:
# Shuffle the labelled users with a fixed seed, then cut the list at the 90% mark.
all_identified_users = list(user_to_politics)
random.seed(42)
random.shuffle(all_identified_users)

split_at = int(0.9 * len(all_identified_users))

# NOTE(review): as written, train_users is the final ~10% of the shuffled list
# and test_users the first ~90% — confirm this is intended and not an inverted split.
train_users, test_users = all_identified_users[split_at:], all_identified_users[:split_at]

# Get the users for each party to seed PPR 

In [None]:
# Training users bucketed by flair label; these sets seed the two PPR runs
rep_users = {user for user in train_users if user_to_politics[user] == 'Republican'}
dem_users = {user for user in train_users if user_to_politics[user] == 'Democrat'}

In [None]:
# Total number of nodes in the bipartite graph
n = len(g)
# Small baseline mass handed to every node that is not a seed user
epsilon = 1e-9

In [None]:
# Personalization vector for the Republican-seeded PPR: every node in
# rep_users gets the same seed mass, every other node gets `epsilon`.
rep_personalization = {}
num_nodes = g.number_of_nodes()
mass_per_rep_user = 1.0 / (len(rep_users) + (epsilon * (num_nodes - len(rep_users))))

# The loop variable is `node` rather than `n` so the node count stored in `n`
# is not overwritten with a node name.
for node in tqdm(g.nodes, total=num_nodes):
    if node in rep_users:
        rep_personalization[node] = mass_per_rep_user
    else:
        rep_personalization[node] = epsilon

In [None]:
# Personalization vector for the Democrat-seeded PPR: every node in
# dem_users gets the same seed mass, every other node gets `epsilon`.
dem_personalization = {}
# Take the node count from the graph directly: `n` may have been rebound to a
# node name by an earlier loop that used it as its loop variable.
num_nodes = g.number_of_nodes()
mass_per_dem_user = 1.0 / (len(dem_users) + (epsilon * (num_nodes - len(dem_users))))

for node in tqdm(g.nodes, total=num_nodes):
    if node in dem_users:
        dem_personalization[node] = mass_per_dem_user
    else:
        dem_personalization[node] = epsilon

In [None]:
# Sanity check: each vector is filled once per node of g, so the sizes should match
for personalization in (rep_personalization, dem_personalization):
    print(len(personalization))

In [None]:
# Personalized PageRank over g, seeded by the Republican personalization vector
rep_ppr = nx.pagerank(g, personalization=rep_personalization)

In [None]:
# Personalized PageRank over g, seeded by the Democrat personalization vector
dem_ppr = nx.pagerank(g, personalization=dem_personalization)

# TODO:  

* try to get two scores for each user and subreddit: 
 * sum the rep_ppr and dem_ppr for a user/subreddit and then normalize across all users and (separately) all subreddits to get a score of "politicalness". E.g., subreddits that have a higher score end up being visited more frequently by politically active users
  * normalize the rep_ppr/dem_ppr values _within_ a user/subreddit to get their "political leaning". We can potentially plot this relative to the "politicalness" to see which subreddits are frequented by political folks and have strong political affiliations
* Test whether the "political leaning" score predicts the affiliations for the users in the `test_users` set by seeing which affiliation (from the PPR score) is larger.