In [1]:
import json
import gzip
import bz2
import lzma
import glob
from os.path import basename
from collections import *
from tqdm.notebook import tqdm
import ast
import networkx as nx
import random
import pickle
from collections import Counter
from matplotlib import pyplot as plt 

# SETTINGS
# Year prefix: selects the input files matching `<year>*.tsv` and is embedded
# in the names of the output files written by the cells below.
year = '2019'


Bad key "text.kerning_factor" on line 4 in
/opt/anaconda/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.3/matplotlibrc.template
or from the matplotlib source distribution


# Clean up data for the graph

### Grab known bot accounts

In [2]:
# Tab-separated file listing the known bot accounts.
fname = '/shared/0/projects/prosocial/known-bots.tsv'

with open(fname, 'rt') as f:
    lines = f.readlines()

# Keep the second tab-separated field of every row.
bots = [row.split('\t')[1] for row in lines]

print("Known bots: %d" % len(bots))

Known bots: 393


### Top 10% of subreddits and users involved in 3+ subreddits

In [None]:
# Input rows are "<user>\t<subreddit>\t<frequency>", one file per glob match.
directory = '/shared/0/projects/reddit-political-affiliation/data/bipartite-networks/' + year + '*.tsv'
# The "*_filtered.tsv" files written by the filtering cell further down live in
# the same directory and match the same glob. Skip them so that re-running this
# cell does not double count posts or create 'r/r/<name>' subreddit entries.
files = [path for path in glob.glob(directory) if "filtered" not in path]

subreddit_submissions = Counter()   # subreddit -> summed frequency
user_subreddits = defaultdict(set)  # user -> set of subreddits the user appears in
user_post_totals = Counter()        # user -> summed frequency over all subreddits

for fname in tqdm(files, desc='Processing all files'):
    with open(fname, 'rt') as f:
        lines = f.readlines()

    for line in tqdm(lines, position=1, desc='Counting subreddit and user frequency'):
        # Strip only the line terminator: slicing off the last character would
        # drop a digit of the count when the final line has no trailing newline.
        user, subreddit, freq = line.rstrip('\n').split('\t')
        freq = int(freq)
        subreddit = 'r/' + subreddit
        subreddit_submissions[subreddit] += freq
        user_subreddits[user].add(subreddit)
        user_post_totals[user] += freq

# Grab top 10% of subreddits (ranked by summed frequency)
total_subreddits = len(subreddit_submissions)
top_subreddits = subreddit_submissions.most_common(int(total_subreddits * .1))
print("Total # of subreddits %d" % total_subreddits)
print("Ten percent of subreddits %d" % len(top_subreddits))

# Grab users who post in 3+ subreddits
print("Total # of users %d" % len(user_subreddits))
user_subreddits = {k: v for k, v in user_subreddits.items() if len(v) >= 3}
print("Users who post in 3+ subreddits %d" % len(user_subreddits))

HBox(children=(FloatProgress(value=0.0, description='Processing all files', max=26.0, style=ProgressStyle(desc…

HBox(children=(FloatProgress(value=0.0, description='Counting subreddit and user frequency', max=30701171.0, s…




HBox(children=(FloatProgress(value=0.0, description='Counting subreddit and user frequency', max=28107483.0, s…




HBox(children=(FloatProgress(value=0.0, description='Counting subreddit and user frequency', max=99416.0, styl…




HBox(children=(FloatProgress(value=0.0, description='Counting subreddit and user frequency', max=28249465.0, s…




HBox(children=(FloatProgress(value=0.0, description='Counting subreddit and user frequency', max=31026535.0, s…




HBox(children=(FloatProgress(value=0.0, description='Counting subreddit and user frequency', max=30972308.0, s…




HBox(children=(FloatProgress(value=0.0, description='Counting subreddit and user frequency', max=28479860.0, s…




HBox(children=(FloatProgress(value=0.0, description='Counting subreddit and user frequency', max=29645265.0, s…




HBox(children=(FloatProgress(value=0.0, description='Counting subreddit and user frequency', max=27368767.0, s…

### Remove super users

In [None]:
# Drop "super users": keep only users whose summed frequency in
# user_post_totals is strictly below the cutoff.
cutoff = 10000
user_subreddits = {
    user: subs
    for user, subs in user_subreddits.items()
    if not user_post_totals[user] >= cutoff
}
print("Total # of users %d" % len(user_subreddits))

### Save results to a TSV

In [None]:
out_dir = '/shared/0/projects/reddit-political-affiliation/data/bipartite-networks/'

# One "<subreddit>\t<count>" row per entry of top_subreddits.
with open(out_dir + 'subreddits_' + year + '.tsv', 'w') as f:
    f.writelines('%s\t%d\n' % pair for pair in top_subreddits)

# One "<user>\t<number of subreddits>" row per retained user.
with open(out_dir + 'users_' + year + '.tsv', 'w') as f:
    f.writelines(
        '%s\t%d\n' % (name, len(subs)) for name, subs in user_subreddits.items()
    )

### Load results back in

In [None]:
out_dir = '/shared/0/projects/reddit-political-affiliation/data/bipartite-networks/'

subreddit_counts = {}  # subreddit -> count, as written by the save cell
user_sub_counts = {}   # user -> number of subreddits, as written by the save cell

with open(out_dir + 'subreddits_' + year + '.tsv', 'r') as f:
    for line in f:
        # Drop the line terminator before splitting, otherwise the count keeps
        # a trailing "\n"; parse it back to the int that was written.
        element, count = line.rstrip('\n').split("\t")
        subreddit_counts[element] = int(count)


with open(out_dir + 'users_' + year + '.tsv', 'r') as f:
    for line in f:
        user, sub_counts = line.rstrip('\n').split("\t")
        user_sub_counts[user] = int(sub_counts)

# Report sizes only: printing the whole user dictionary floods the output.
print(len(subreddit_counts))
print(len(user_sub_counts))

## Go through the original counts and keep only the selected users and subreddits

In [None]:
# Sanity check before filtering: size of the in-memory user selection
# (user_subreddits) and of the subreddit table reloaded from disk
# (subreddit_counts). Both are used as membership filters in the next cell.
print(len(user_subreddits), len(subreddit_counts))

In [None]:
# Collect the raw input files, skipping any "*_filtered.tsv" output of a
# previous run (they match the same glob).
directory = '/shared/0/projects/reddit-political-affiliation/data/bipartite-networks/' + year + '*.tsv'
files = glob.glob(directory)
files = [f for f in files if "filtered" not in f]

# Write one "<name>_filtered.tsv" next to each input file, containing only the
# rows whose user and subreddit were both selected above.
for fname in tqdm(files, desc='Processing all files'):
    out_path = fname[:-4] + "_filtered.tsv"
    print(out_path)

    with open(fname, 'rt') as f:
        lines = f.readlines()
        print(len(lines))

    # The context manager closes the output even if a malformed line raises.
    with open(out_path, "wt") as out_file:
        for line in tqdm(lines, position=1, desc='Filtering lines for the month'):
            # Strip only the line terminator: slicing off the last character
            # would drop a digit of the count when the final line has no
            # trailing newline.
            user, subreddit, freq = line.rstrip('\n').split('\t')
            subreddit = 'r/' + subreddit
            freq = int(freq)
            # Use dictionary for O(1) lookups
            if user in user_subreddits and subreddit in subreddit_counts:
                out_file.write("%s\t%s\t%d\n" % (user, subreddit, freq))

# Build the new Graph

In [None]:
def build_bipartite_graph(directory, top_subreddits, filtered_users):
    """Build a weighted user-subreddit graph from tab-separated count files.

    Every file matching ``directory`` is read as rows of
    ``<user>\\t<subreddit>\\t<frequency>``. Each row adds ``frequency`` to the
    ``weight`` attribute of the edge between ``user`` and ``subreddit``; the
    edge (and its endpoints) is created on first sight.

    Parameters
    ----------
    directory : str
        Glob pattern selecting the input files.
    top_subreddits, filtered_users
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    networkx.Graph
        Undirected graph whose edges carry the summed frequency in ``weight``.

    Raises
    ------
    ValueError
        If a row does not have exactly three fields or the frequency is not
        an integer.
    """
    files = glob.glob(directory)
    g = nx.Graph()

    for fname in tqdm(files, desc='Processing all files'):
        with open(fname, 'rt') as f:
            lines = f.readlines()
            print(len(lines))

        for line in tqdm(lines, position=1, desc='Build graph from file'):
            # Strip only the line terminator: slicing off the last character
            # would drop a digit of the count when the final line has no
            # trailing newline.
            user, subreddit, freq = line.rstrip('\n').split('\t')
            freq = int(freq)

            if g.has_edge(user, subreddit):
                g[user][subreddit]['weight'] += freq
            else:
                # add_edge creates any missing endpoint, so no explicit
                # add_node calls are needed.
                g.add_edge(user, subreddit, weight=freq)
    return g

# Build the graph from this year's "*_filtered.tsv" files and report its size.
location = '/shared/0/projects/reddit-political-affiliation/data/bipartite-networks/' + year + '*_filtered.tsv'
directory = location
g = build_bipartite_graph(directory, top_subreddits, user_subreddits)
print("Total nodes: %d" % g.number_of_nodes())

In [None]:
# Persist the graph as a pickle named "<year>_graph_filtered.pickle".
out_dir = '/shared/0/projects/reddit-political-affiliation/data/bipartite-networks/'
with open(out_dir + year + '_graph_filtered.pickle', 'wb') as handle:
    pickle.Pickler(handle, protocol=pickle.HIGHEST_PROTOCOL).dump(g)