# Sanity Checks

In [1]:
import os 
import sys
import glob

sys.path.append('/home/kalkiek/projects/reddit-political-affiliation/')

import itertools
import random
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from tqdm.notebook import tqdm
from collections import defaultdict, Counter
from sklearn.metrics import accuracy_score

from src.data.make_dataset import build_dataset
from src.models.word2vec.User2Subreddit import User2Subreddit

from sklearn.metrics import auc, roc_curve

## Distribution of Validation Set

In [2]:
# Select the monthly snapshot to inspect and build the dataset for it.
year_month = '2019-04'

# Bipartite-network input for the selected month.
network_path = ''.join((
    '/shared/0/projects/reddit-political-affiliation/data/bipartite-networks/',
    year_month,
    '_filtered.tsv',
))
# Flair-affiliation input for the selected month. Despite the variable name,
# this is the path of a single .tsv file, not a directory.
flair_directory = ''.join((
    '/shared/0/projects/reddit-political-affiliation/data/flair-affiliations/',
    year_month,
    '.tsv',
))

dataset, training, validation, pol_validation, vocab = build_dataset(network_path, flair_directory)


Building vocab from file:   0%|          | 0/28132858 [00:00<?, ?it/s][A
Building vocab from file:   0%|          | 3259/28132858 [00:00<1:23:36, 5607.60it/s][A
Building vocab from file:   0%|          | 65103/28132858 [00:00<58:37, 7979.84it/s] [A
Building vocab from file:   0%|          | 133798/28132858 [00:00<41:08, 11343.30it/s][A
Building vocab from file:   1%|          | 167476/28132858 [00:01<31:38, 14733.08it/s][A
Building vocab from file:   1%|          | 234424/28132858 [00:01<22:18, 20850.60it/s][A
Building vocab from file:   1%|          | 297307/28132858 [00:01<15:47, 29369.22it/s][A
Building vocab from file:   1%|▏         | 364824/28132858 [00:01<11:14, 41188.18it/s][A
Building vocab from file:   2%|▏         | 429058/28132858 [00:01<08:03, 57266.50it/s][A
Building vocab from file:   2%|▏         | 498765/28132858 [00:01<05:49, 79026.87it/s][A
Building vocab from file:   2%|▏         | 563209/28132858 [00:02<04:17, 107258.50it/s][A
Building vocab from file: 

Building vocab from file:  20%|█▉        | 5610392/28132858 [00:13<01:01, 363841.75it/s][A
Building vocab from file:  20%|██        | 5676847/28132858 [00:13<00:53, 420990.31it/s][A
Building vocab from file:  20%|██        | 5746081/28132858 [00:13<00:46, 477084.91it/s][A
Building vocab from file:  21%|██        | 5811072/28132858 [00:14<00:43, 516085.76it/s][A
Building vocab from file:  21%|██        | 5877184/28132858 [00:14<00:40, 552443.52it/s][A
Building vocab from file:  21%|██        | 5943661/28132858 [00:14<00:38, 581941.55it/s][A
Building vocab from file:  21%|██▏       | 6012742/28132858 [00:14<00:36, 610818.89it/s][A
Building vocab from file:  22%|██▏       | 6079312/28132858 [00:14<00:35, 626008.38it/s][A
Building vocab from file:  22%|██▏       | 6145848/28132858 [00:14<00:34, 636554.58it/s][A
Building vocab from file:  22%|██▏       | 6212301/28132858 [00:14<00:34, 641451.61it/s][A
Building vocab from file:  22%|██▏       | 6280456/28132858 [00:14<00:33, 652976

Building vocab from file:  40%|███▉      | 11214109/28132858 [00:27<00:32, 521971.10it/s][A
Building vocab from file:  40%|████      | 11278902/28132858 [00:28<00:30, 554296.88it/s][A
Building vocab from file:  40%|████      | 11344007/28132858 [00:28<00:28, 580160.71it/s][A
Building vocab from file:  41%|████      | 11409417/28132858 [00:28<00:27, 600523.88it/s][A
Building vocab from file:  41%|████      | 11473911/28132858 [00:28<00:27, 613190.88it/s][A
Building vocab from file:  41%|████      | 11538097/28132858 [00:28<00:27, 601541.01it/s][A
Building vocab from file:  41%|████      | 11600307/28132858 [00:28<00:27, 591629.95it/s][A
Building vocab from file:  41%|████▏     | 11665196/28132858 [00:28<00:27, 607717.78it/s][A
Building vocab from file:  42%|████▏     | 11727110/28132858 [00:28<00:26, 608736.50it/s][A
Building vocab from file:  42%|████▏     | 11788784/28132858 [00:28<00:27, 600477.23it/s][A
Building vocab from file:  42%|████▏     | 11849417/28132858 [00:29<00

Building vocab from file:  60%|█████▉    | 16867964/28132858 [00:41<00:17, 632037.39it/s][A
Building vocab from file:  60%|██████    | 16931187/28132858 [00:41<00:17, 625557.05it/s][A
Building vocab from file:  60%|██████    | 16993772/28132858 [00:41<00:18, 609599.59it/s][A
Building vocab from file:  61%|██████    | 17055987/28132858 [00:41<00:18, 613309.23it/s][A
Building vocab from file:  61%|██████    | 17117401/28132858 [00:41<00:18, 604623.59it/s][A
Building vocab from file:  61%|██████    | 17178325/28132858 [00:41<00:18, 605998.73it/s][A
Building vocab from file:  61%|██████▏   | 17239870/28132858 [00:41<00:17, 608801.51it/s][A
Building vocab from file:  61%|██████▏   | 17300796/28132858 [00:43<01:54, 94904.80it/s] [A
Building vocab from file:  62%|██████▏   | 17360876/28132858 [00:43<01:24, 126981.68it/s][A
Building vocab from file:  62%|██████▏   | 17421416/28132858 [00:43<01:04, 166440.56it/s][A
Building vocab from file:  62%|██████▏   | 17477964/28132858 [00:43<00

Building vocab from file:  79%|███████▉  | 22247220/28132858 [00:57<01:20, 72998.28it/s] [A
Building vocab from file:  79%|███████▉  | 22307646/28132858 [00:57<00:58, 99149.85it/s][A
Building vocab from file:  80%|███████▉  | 22366667/28132858 [00:57<00:43, 132129.78it/s][A
Building vocab from file:  80%|███████▉  | 22427164/28132858 [00:57<00:33, 172595.87it/s][A
Building vocab from file:  80%|███████▉  | 22490138/28132858 [00:57<00:25, 220647.98it/s][A
Building vocab from file:  80%|████████  | 22552591/28132858 [00:57<00:20, 273759.64it/s][A
Building vocab from file:  80%|████████  | 22616844/28132858 [00:58<00:16, 330699.58it/s][A
Building vocab from file:  81%|████████  | 22677344/28132858 [00:58<00:14, 382470.69it/s][A
Building vocab from file:  81%|████████  | 22740461/28132858 [00:58<00:12, 433742.75it/s][A
Building vocab from file:  81%|████████  | 22803022/28132858 [00:58<00:11, 477692.99it/s][A
Building vocab from file:  81%|████████▏ | 22865713/28132858 [00:58<00:

Building vocab from file:  96%|█████████▌| 26896197/28132858 [01:15<00:26, 46203.06it/s][A
Building vocab from file:  96%|█████████▌| 26943998/28132858 [01:15<00:18, 63378.89it/s][A
Building vocab from file:  96%|█████████▌| 26993957/28132858 [01:15<00:13, 85872.39it/s][A
Building vocab from file:  96%|█████████▌| 27043211/28132858 [01:15<00:09, 114145.78it/s][A
Building vocab from file:  96%|█████████▋| 27094940/28132858 [01:15<00:06, 148976.61it/s][A
Building vocab from file:  96%|█████████▋| 27142214/28132858 [01:15<00:05, 187500.24it/s][A
Building vocab from file:  97%|█████████▋| 27189458/28132858 [01:16<00:04, 223976.61it/s][A
Building vocab from file:  97%|█████████▋| 27240094/28132858 [01:16<00:03, 268975.60it/s][A
Building vocab from file:  97%|█████████▋| 27290913/28132858 [01:16<00:02, 313204.37it/s][A
Building vocab from file:  97%|█████████▋| 27339734/28132858 [01:16<00:02, 350944.38it/s][A
Building vocab from file:  97%|█████████▋| 27391209/28132858 [01:16<00:01

Length of vocab: 5762539
User count: 5697629
Subreddit count: 64910
User to politic counts: 2589
[('WatchMaga', Counter({'Republican': 2})), ('BasedMedicalDoctor', Counter({'Republican': 2})), ('Damemezaredadremez', Counter({'Republican': 2})), ('joeohegna', Counter({'Republican': 1})), ('Deplorable_scum', Counter({'Republican': 2})), ('TheRabbidHD', Counter({'Republican': 2})), ('Paladin-Arda', Counter({'Democrat': 2})), ('Trumpwillalwayswin', Counter({'Republican': 2})), ('daw-nee-yale', Counter({'Republican': 1})), ('BigcatTV', Counter({'Republican': 2}))]
Saw political affiliations for 2588 users
User to politics training size: {}: 2330
User to politics validation size: {}: 258


Converting data to PyTorch: 100%|██████████| 5697629/5697629 [08:35<00:00, 11053.90it/s]


Train size: 151917434 Validation size: 16879714


In [3]:
# Class balance of the political validation split: a label of 0 is counted
# as Democrat, every other label as Republican.
validation_labels = [label for _, label in pol_validation.items()]
dems = validation_labels.count(0)
reps = len(validation_labels) - dems

total = reps + dems
print("Percent republicans: {}".format(reps/total))
print("Percent democrats: {}".format(dems/total))

Percent republicans: 0.8992248062015504
Percent democrats: 0.10077519379844961


## Sklearn Dummy Classifiers

In [4]:
# Parallel lists: dataset index of each validation user and that user's label.
user_ids, pol_labels = [], []

for user, pol_label in pol_validation.items():
    try:
        user_idx = dataset.user_to_idx[user]
    except KeyError:
        # Validation users with no index in the dataset are left out.
        continue
    user_ids.append(user_idx)
    pol_labels.append(pol_label)

In [5]:
from sklearn.dummy import DummyClassifier

# Majority-class baseline: always predicts the most frequent label it was
# fit on, so the score below is the accuracy any real model has to beat.
dummy_clf = DummyClassifier(strategy="most_frequent").fit(user_ids, pol_labels)
dummy_clf.predict(user_ids)  # result is not used; kept from the original cell
dummy_clf.score(user_ids, pol_labels)

0.8927038626609443

# Single Month Analysis Not Using Predictions

In [6]:
def read_political_affiliations(files):
    """Aggregate per-user flair counts from TSV files and return binary labels.

    Every non-blank line must contain exactly three tab-separated fields:
    user name, political affiliation and an integer frequency. Frequencies
    are summed per (user, affiliation) pair across all files. Users observed
    with more than one distinct affiliation are dropped as ambiguous.

    Args:
        files: iterable of paths to flair-affiliation TSV files.

    Returns:
        The mapping returned by ``convert_affiliations_to_binary`` for the
        users that have exactly one affiliation.

    Raises:
        ValueError: if a non-blank line does not have exactly three
            tab-separated fields, or its frequency field is not an integer.
    """
    user_to_politic_counts = defaultdict(Counter)

    for fname in tqdm(files):
        with open(fname, 'rt') as f:
            for line in f:
                # A blank (e.g. trailing) line would otherwise break the
                # three-way unpack below with a ValueError.
                if not line.strip():
                    continue
                user, politics, freq = line.split('\t')
                # int() tolerates the trailing newline left on the last field.
                user_to_politic_counts[user][politics] += int(freq)

    print("User to politic counts: " + str(len(user_to_politic_counts)))
    print(list(user_to_politic_counts.items())[:10])

    # Keep only users with a single distinct affiliation. Every Counter here
    # has at least one key, because it is created by the += above.
    user_to_politics = {
        u: next(iter(pc))
        for u, pc in user_to_politic_counts.items()
        if len(pc) == 1
    }

    print('Saw political affiliations for %d users' % len(user_to_politics))
    return convert_affiliations_to_binary(user_to_politics)


def convert_affiliations_to_binary(user_to_politics):
    """Encode affiliation names as binary labels, in place.

    "Democrat" becomes 0 and every other affiliation becomes 1. Values that
    are already 0 or 1 are left untouched, so calling this again on an
    already-encoded mapping (e.g. when a notebook cell is re-run) does not
    flip Democrats to 1.

    Args:
        user_to_politics: dict mapping user name to an affiliation string
            (or to an already-encoded 0/1 label). Modified in place.

    Returns:
        The same dict object, with every value replaced by 0 or 1.
    """
    # Only values are reassigned, never keys, so mutating while iterating
    # over .items() is safe.
    for user, politics in user_to_politics.items():
        if politics in (0, 1):
            # Already encoded; re-encoding would map 0 -> 1.
            continue
        user_to_politics[user] = 0 if politics == "Democrat" else 1

    return user_to_politics

# Path of this month's flair-affiliation TSV (a single file, despite the name).
flair_directory = "/shared/0/projects/reddit-political-affiliation/data/flair-affiliations/" + year_month + ".tsv"
# glob returns a list of matching paths; it is empty if the file does not exist.
flair_files = glob.glob(flair_directory)
# Binary label per user (0 = Democrat, 1 = any other affiliation).
user_to_politics = read_political_affiliations(flair_files)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


User to politic counts: 2589
[('WatchMaga', Counter({'Republican': 2})), ('BasedMedicalDoctor', Counter({'Republican': 2})), ('Damemezaredadremez', Counter({'Republican': 2})), ('joeohegna', Counter({'Republican': 1})), ('Deplorable_scum', Counter({'Republican': 2})), ('TheRabbidHD', Counter({'Republican': 2})), ('Paladin-Arda', Counter({'Democrat': 2})), ('Trumpwillalwayswin', Counter({'Republican': 2})), ('daw-nee-yale', Counter({'Republican': 1})), ('BigcatTV', Counter({'Republican': 2}))]
Saw political affiliations for 2588 users


In [7]:
# Distribution
reps, dems = 0, 0
for k, v in user_to_politics.items():
    if v == 1:
        reps += 1
    else:
        dems += 1
        
total = reps + dems
print("Percent republicans: {}".format(reps/total))
print("Percent democrats: {}".format(dems/total))

Percent republicans: 0.8941267387944358
Percent democrats: 0.10587326120556415


## Polarized Subreddits

In [8]:
user_subreddits = dataset.user_subreddits
subreddit_scores = defaultdict(int)   # sub -> sum of binary labels of its labelled users
subreddit_counts = Counter()          # sub -> number of (labelled user, sub) entries seen
subreddit_users = defaultdict(set)    # sub -> set of labelled user names

# Minimum count a subreddit needs before it is ranked.
# NOTE(review): the name says "post", but the count below is incremented once
# per entry in user_subreddits[user] -- confirm whether that is posts or users.
MIN_POST_THRESHOLD = 20

for user, score in user_to_politics.items():
    # NOTE(review): if user_subreddits is a defaultdict, indexing an unknown
    # user inserts an empty entry into the dataset -- confirm this is acceptable.
    subreddits = user_subreddits[user]
    for sub in subreddits:
        # Skip names whose 3rd-4th characters are 'u_'
        # (presumably user-profile pages such as 'r/u_<name>' -- confirm).
        if sub[2:4] != 'u_':
            subreddit_scores[sub] += score
            subreddit_counts[sub] += 1
            subreddit_users[sub].add(user)

# Fraction of label-1 (non-Democrat) entries per subreddit, restricted to
# subreddits that meet the threshold.
norm_sub_scores = {}

for sub, score in subreddit_scores.items():
    count = subreddit_counts[sub]
    # Use the named constant rather than a second hard-coded 20, so changing
    # the threshold in one place actually changes the filter.
    if count >= MIN_POST_THRESHOLD:
        norm_sub_scores[sub] = score / count

# Highest score first; ties keep their insertion order because sorted() is stable.
sorted_sub_scores = dict(sorted(norm_sub_scores.items(), key=lambda item: item[1], reverse=True))
top_results = dict(itertools.islice(sorted_sub_scores.items(), 25))

print("Top 25 right leaning subreddits")

for sub, score in top_results.items():
    print(sub, score, subreddit_counts[sub])

Top 25 right leaning subreddits
r/TumblrInAction 1.0 55
r/progun 1.0 59
r/Firearms 1.0 48
r/Catholicism 1.0 32
r/prolife 1.0 29
r/Cringetopia 1.0 42
r/The_MuellerMeltdown 1.0 51
r/drumpfisfinished 1.0 29
r/ShitPoliticsSays 1.0 105
r/TheNewRight 1.0 35
r/tucker_carlson 1.0 21
r/RightwingLGBT 1.0 33
r/benshapiro 1.0 30
r/ar15 1.0 28
r/weekendgunnit 1.0 24
r/gunpolitics 1.0 32
r/walkaway 1.0 29
r/ImGoingToHellForThis 1.0 50
r/Rainbow6 1.0 29
r/gundeals 1.0 45
r/frenworld 1.0 27
r/ZooForTheLowIQRight 1.0 20
r/awfuleverything 1.0 20
r/texas 1.0 21
r/MensRights 1.0 52


In [9]:
# Re-rank ascending: the score is the fraction of label-1 (non-Democrat)
# entries, so the lowest scores are the most Democrat-leaning subreddits.
ascending_items = sorted(norm_sub_scores.items(), key=lambda item: item[1])
sorted_sub_scores = dict(ascending_items)
top_results = dict(itertools.islice(sorted_sub_scores.items(), 25))

print("Top 25 left leaning subreddits")

for sub, score in top_results.items():
    print(sub, score, subreddit_counts[sub])

Top 25 left leaning subreddits
r/AskALiberal 0.16201117318435754 179
r/centerleftpolitics 0.18181818181818182 22
r/CanadaPolitics 0.358974358974359 39
r/neoliberal 0.45454545454545453 33
r/ENLIGHTENEDCENTRISM 0.5416666666666666 24
r/AskTrumpSupporters 0.5666666666666667 60
r/PoliticalDiscussion 0.6122448979591837 49
r/changemyview 0.6326530612244898 49
r/bestof 0.64 25
r/ProgrammerHumor 0.6451612903225806 31
r/AskConservatives 0.6666666666666666 21
r/AskAnAmerican 0.671875 64
r/canada 0.6756756756756757 37
r/Gamingcirclejerk 0.6956521739130435 23
r/blackmagicfuckery 0.72 25
r/Economics 0.7352941176470589 34
r/TopMindsOfReddit 0.7391304347826086 46
r/europe 0.7391304347826086 23
r/SubredditDrama 0.7419354838709677 31
r/MovieDetails 0.75 28
r/atheism 0.75 44
r/ChapoTrapHouse 0.75 32
r/moderatepolitics 0.7619047619047619 21
r/MHOC 0.7727272727272727 22
r/CFB 0.7777777777777778 27


## Major Subreddit Distributions

In [10]:
import seaborn as sns

major_subreddits = ['r/politics', 'r/pics', 'r/AskReddit', 'r/Conservative', 'r/Liberal']

def plot_sub_scores(subreddit, subreddit_users):
    """Plot the distribution of binary political labels among a subreddit's users.

    Args:
        subreddit: key into ``subreddit_users``; also used as the plot title.
        subreddit_users: mapping from subreddit name to an iterable of user names.

    Labels are looked up in the notebook-level ``user_to_politics`` mapping.
    The x-axis range is left to seaborn's defaults.
    """
    sns.set_theme(style="darkgrid")
    scores = [user_to_politics[member] for member in subreddit_users[subreddit]]
    sns.displot(scores).set(title=subreddit)

for sub in major_subreddits:
    plot_sub_scores(sub, subreddit_users)


Bad key "text.kerning_factor" on line 4 in
/opt/anaconda/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.3/matplotlibrc.template
or from the matplotlib source distribution


# SVD On Subreddit Embeddings

In [11]:
from sklearn.decomposition import TruncatedSVD

# Restore the trained User2Subreddit checkpoint for the selected month on CPU.
PATH = '/shared/0/projects/reddit-political-affiliation/working-dir/word2vec-outputs/' + year_month + '/9.pt'
embedding_dim = 50
model = User2Subreddit(dataset.num_users(), embedding_dim, len(dataset.subreddit_to_idx))
# The loaded state dict is passed straight through rather than bound to a name,
# so no second copy of the weights stays alive in the kernel.
model.load_state_dict(torch.load(PATH, map_location=torch.device('cpu')))
model.eval()  # returns the module itself, now in evaluation mode
print(model)

# Fit a 10-component truncated SVD to the subreddit (v) embedding matrix.
X = model.v_embeddings.weight.detach().numpy()
svd = TruncatedSVD(n_components=10, n_iter=7, random_state=42)
svd.fit(X)

User2Subreddit(
  (u_embeddings): Embedding(5697629, 50)
  (v_embeddings): Embedding(64910, 50)
  (political_layer): Linear(in_features=50, out_features=1, bias=True)
  (before_pol_dropout): Dropout(p=0.5, inplace=False)
)


TruncatedSVD(algorithm='randomized', n_components=10, n_iter=7, random_state=42,
             tol=0.0)

In [12]:
# Per-component explained-variance ratio, its total, and the singular values
# of the fitted SVD, printed in that order.
for svd_stat in (
    svd.explained_variance_ratio_,
    svd.explained_variance_ratio_.sum(),
    svd.singular_values_,
):
    print(svd_stat)

[0.02673197 0.02344752 0.02207426 0.02170555 0.02122099 0.02109843
 0.0208639  0.02070619 0.02056793 0.02037031]
0.21878704
[100.16848  95.11045  90.69617  89.8216   88.80923  88.55945  88.07457
  87.72496  87.43373  87.01421]


# Multi-month Analysis Not Using Predictions

In [13]:
# Load in multiple months
year_months = ['2019-04', '2019-05', '2019-06']
models = {}
datasets = {}
for year_month in year_months:
    network_path = '/shared/0/projects/reddit-political-affiliation/data/bipartite-networks/' + year_month + '_filtered.tsv'
    flair_directory = '/shared/0/projects/reddit-political-affiliation/data/flair-affiliations/' + year_month + '.tsv'

    dataset, training, validation, pol_validation, vocab = build_dataset(network_path, flair_directory)
    PATH = '/shared/0/projects/reddit-political-affiliation/working-dir/word2vec-outputs/' + year_month + '/9.pt'
    embedding_dim = 50
    model = User2Subreddit(dataset.num_users(), embedding_dim, len(dataset.subreddit_to_idx))
    model.load_state_dict(torch.load(PATH, map_location=torch.device('cpu')))
    models[year_month] = model
    datasets[year_month] = dataset


Building vocab from file:   0%|          | 0/28132858 [00:00<?, ?it/s][A
Building vocab from file:   0%|          | 1371/28132858 [00:00<3:14:50, 2406.42it/s][A
Building vocab from file:   0%|          | 24498/28132858 [00:01<2:19:32, 3357.03it/s][A
Building vocab from file:   0%|          | 94301/28132858 [00:01<1:37:38, 4785.89it/s][A
Building vocab from file:   1%|          | 164248/28132858 [00:01<1:08:22, 6816.99it/s][A
Building vocab from file:   1%|          | 229823/28132858 [00:01<47:57, 9695.37it/s]  [A
Building vocab from file:   1%|          | 296415/28132858 [00:01<33:42, 13761.30it/s][A
Building vocab from file:   1%|▏         | 373038/28132858 [00:01<23:42, 19508.84it/s][A
Building vocab from file:   2%|▏         | 442745/28132858 [00:01<16:45, 27539.45it/s][A
Building vocab from file:   2%|▏         | 504632/28132858 [00:01<12:19, 37360.92it/s][A
Building vocab from file:   2%|▏         | 554630/28132858 [00:02<10:38, 43177.13it/s][A
Building vocab from file

Building vocab from file:  22%|██▏       | 6056795/28132858 [00:10<00:33, 666298.56it/s][A
Building vocab from file:  22%|██▏       | 6123751/28132858 [00:10<00:34, 644869.44it/s][A
Building vocab from file:  22%|██▏       | 6188616/28132858 [00:11<00:34, 641702.37it/s][A
Building vocab from file:  22%|██▏       | 6253573/28132858 [00:11<00:33, 644040.47it/s][A
Building vocab from file:  22%|██▏       | 6318167/28132858 [00:11<00:35, 615897.11it/s][A
Building vocab from file:  23%|██▎       | 6380150/28132858 [00:11<00:36, 602249.71it/s][A
Building vocab from file:  23%|██▎       | 6443164/28132858 [00:11<00:35, 610352.55it/s][A
Building vocab from file:  23%|██▎       | 6507228/28132858 [00:11<00:34, 619131.37it/s][A
Building vocab from file:  23%|██▎       | 6574174/28132858 [00:11<00:34, 633415.95it/s][A
Building vocab from file:  24%|██▎       | 6641519/28132858 [00:11<00:33, 644915.29it/s][A
Building vocab from file:  24%|██▍       | 6706225/28132858 [00:11<00:33, 643710

Building vocab from file:  42%|████▏     | 11776947/28132858 [00:19<00:25, 637296.06it/s][A
Building vocab from file:  42%|████▏     | 11840693/28132858 [00:20<00:26, 619123.51it/s][A
Building vocab from file:  42%|████▏     | 11904826/28132858 [00:20<00:26, 620552.24it/s][A
Building vocab from file:  43%|████▎     | 11972043/28132858 [00:20<00:25, 635183.12it/s][A
Building vocab from file:  43%|████▎     | 12038048/28132858 [00:20<00:25, 642442.14it/s][A
Building vocab from file:  43%|████▎     | 12104766/28132858 [00:20<00:24, 649667.99it/s][A
Building vocab from file:  43%|████▎     | 12169839/28132858 [00:20<00:24, 649296.62it/s][A
Building vocab from file:  43%|████▎     | 12237047/28132858 [00:20<00:24, 655966.13it/s][A
Building vocab from file:  44%|████▎     | 12302712/28132858 [00:20<00:24, 654299.79it/s][A
Building vocab from file:  44%|████▍     | 12369371/28132858 [00:20<00:23, 657936.86it/s][A
Building vocab from file:  44%|████▍     | 12435204/28132858 [00:20<00

Building vocab from file:  61%|██████▏   | 17234895/28132858 [00:48<00:17, 614982.55it/s][A
Building vocab from file:  61%|██████▏   | 17297106/28132858 [00:48<00:17, 613051.66it/s][A
Building vocab from file:  62%|██████▏   | 17359374/28132858 [00:48<00:17, 615906.75it/s][A
Building vocab from file:  62%|██████▏   | 17420998/28132858 [00:48<00:17, 614790.14it/s][A
Building vocab from file:  62%|██████▏   | 17482501/28132858 [00:48<00:17, 612147.94it/s][A
Building vocab from file:  62%|██████▏   | 17543884/28132858 [00:49<00:17, 612648.26it/s][A
Building vocab from file:  63%|██████▎   | 17605163/28132858 [00:49<00:17, 609955.52it/s][A
Building vocab from file:  63%|██████▎   | 17666849/28132858 [00:49<00:17, 612009.02it/s][A
Building vocab from file:  63%|██████▎   | 17728061/28132858 [00:49<00:17, 596213.56it/s][A
Building vocab from file:  63%|██████▎   | 17791466/28132858 [00:49<00:17, 607081.45it/s][A
Building vocab from file:  63%|██████▎   | 17854980/28132858 [00:49<00

Building vocab from file:  80%|███████▉  | 22437714/28132858 [00:57<00:09, 588545.65it/s][A
Building vocab from file:  80%|███████▉  | 22496678/28132858 [00:58<00:10, 528510.26it/s][A
Building vocab from file:  80%|████████  | 22550760/28132858 [00:58<00:10, 526403.97it/s][A
Building vocab from file:  80%|████████  | 22605166/28132858 [00:58<00:10, 531578.30it/s][A
Building vocab from file:  81%|████████  | 22661894/28132858 [00:58<00:10, 541806.93it/s][A
Building vocab from file:  81%|████████  | 22722560/28132858 [00:58<00:09, 559756.26it/s][A
Building vocab from file:  81%|████████  | 22784160/28132858 [00:58<00:09, 575519.21it/s][A
Building vocab from file:  81%|████████  | 22845298/28132858 [00:58<00:09, 585827.17it/s][A
Building vocab from file:  81%|████████▏ | 22905177/28132858 [00:58<00:08, 589655.21it/s][A
Building vocab from file:  82%|████████▏ | 22941907/28132858 [01:14<10:52, 7951.62it/s]  [A
Building vocab from file:  82%|████████▏ | 22941907/28132858 [01:14<10

Building vocab from file:  97%|█████████▋| 27324913/28132858 [01:22<00:01, 503504.46it/s][A
Building vocab from file:  97%|█████████▋| 27375959/28132858 [01:22<00:01, 505570.92it/s][A
Building vocab from file:  97%|█████████▋| 27426548/28132858 [01:22<00:01, 502746.35it/s][A
Building vocab from file:  98%|█████████▊| 27476849/28132858 [01:22<00:01, 487497.40it/s][A
Building vocab from file:  98%|█████████▊| 27525718/28132858 [01:22<00:01, 482542.76it/s][A
Building vocab from file:  98%|█████████▊| 27574067/28132858 [01:22<00:01, 457099.25it/s][A
Building vocab from file:  98%|█████████▊| 27620122/28132858 [01:22<00:01, 451833.03it/s][A
Building vocab from file:  98%|█████████▊| 27665561/28132858 [01:22<00:01, 445093.38it/s][A
Building vocab from file:  98%|█████████▊| 27710272/28132858 [01:22<00:00, 437610.12it/s][A
Building vocab from file:  99%|█████████▊| 27754203/28132858 [01:22<00:00, 429555.61it/s][A
Building vocab from file:  99%|█████████▉| 27797311/28132858 [01:23<00

Length of vocab: 5762539
User count: 5697629
Subreddit count: 64910
User to politic counts: 2589
[('WatchMaga', Counter({'Republican': 2})), ('BasedMedicalDoctor', Counter({'Republican': 2})), ('Damemezaredadremez', Counter({'Republican': 2})), ('joeohegna', Counter({'Republican': 1})), ('Deplorable_scum', Counter({'Republican': 2})), ('TheRabbidHD', Counter({'Republican': 2})), ('Paladin-Arda', Counter({'Democrat': 2})), ('Trumpwillalwayswin', Counter({'Republican': 2})), ('daw-nee-yale', Counter({'Republican': 1})), ('BigcatTV', Counter({'Republican': 2}))]
Saw political affiliations for 2588 users
User to politics training size: {}: 2330
User to politics validation size: {}: 258


Converting data to PyTorch: 100%|██████████| 5697629/5697629 [08:51<00:00, 10727.51it/s]


Train size: 151917434 Validation size: 16879714



Building vocab from file:   0%|          | 0/29288824 [00:00<?, ?it/s][A
Building vocab from file:   0%|          | 11430/29288824 [00:09<6:42:23, 1212.62it/s][A
Building vocab from file:   0%|          | 24447/29288824 [00:11<5:09:50, 1574.19it/s][A
Building vocab from file:   0%|          | 82987/29288824 [00:12<3:36:42, 2246.25it/s][A
Building vocab from file:   0%|          | 145457/29288824 [00:12<2:31:35, 3203.99it/s][A
Building vocab from file:   1%|          | 213818/29288824 [00:12<1:46:05, 4567.77it/s][A
Building vocab from file:   1%|          | 290799/29288824 [00:12<1:14:15, 6508.83it/s][A
Building vocab from file:   1%|          | 359881/29288824 [00:12<52:03, 9260.94it/s]  [A
Building vocab from file:   1%|▏         | 430729/29288824 [00:12<36:33, 13156.21it/s][A
Building vocab from file:   2%|▏         | 503527/29288824 [00:12<25:43, 18649.87it/s][A
Building vocab from file:   2%|▏         | 568334/29288824 [00:12<18:12, 26291.69it/s][A
Building vocab from f

Building vocab from file:  21%|██▏       | 6229396/29288824 [00:20<00:33, 693820.22it/s][A
Building vocab from file:  22%|██▏       | 6301957/29288824 [00:20<00:32, 703058.05it/s][A
Building vocab from file:  22%|██▏       | 6372561/29288824 [00:21<00:32, 703947.68it/s][A
Building vocab from file:  22%|██▏       | 6443046/29288824 [00:21<00:34, 665199.26it/s][A
Building vocab from file:  22%|██▏       | 6510071/29288824 [00:21<00:34, 662454.42it/s][A
Building vocab from file:  22%|██▏       | 6581790/29288824 [00:21<00:33, 677977.01it/s][A
Building vocab from file:  23%|██▎       | 6651873/29288824 [00:21<00:33, 684672.67it/s][A
Building vocab from file:  23%|██▎       | 6721811/29288824 [00:21<00:32, 689019.21it/s][A
Building vocab from file:  23%|██▎       | 6792412/29288824 [00:21<00:32, 694028.69it/s][A
Building vocab from file:  23%|██▎       | 6861952/29288824 [00:21<00:33, 668970.26it/s][A
Building vocab from file:  24%|██▎       | 6932200/29288824 [00:21<00:32, 678680

Building vocab from file:  42%|████▏     | 12323307/29288824 [00:29<00:25, 659853.68it/s][A
Building vocab from file:  42%|████▏     | 12392200/29288824 [00:30<00:25, 668314.36it/s][A
Building vocab from file:  43%|████▎     | 12460167/29288824 [00:30<00:25, 671679.77it/s][A
Building vocab from file:  43%|████▎     | 12529034/29288824 [00:30<00:24, 676686.92it/s][A
Building vocab from file:  43%|████▎     | 12597061/29288824 [00:30<00:24, 677756.14it/s][A
Building vocab from file:  43%|████▎     | 12664935/29288824 [00:30<00:26, 633523.70it/s][A
Building vocab from file:  43%|████▎     | 12733719/29288824 [00:30<00:25, 648662.45it/s][A
Building vocab from file:  44%|████▎     | 12802517/29288824 [00:30<00:24, 659976.90it/s][A
Building vocab from file:  44%|████▍     | 12871233/29288824 [00:30<00:24, 667902.09it/s][A
Building vocab from file:  44%|████▍     | 12940222/29288824 [00:30<00:24, 674345.82it/s][A
Building vocab from file:  44%|████▍     | 13008078/29288824 [00:31<00

Building vocab from file:  62%|██████▏   | 18236817/29288824 [00:39<00:17, 643250.87it/s][A
Building vocab from file:  62%|██████▏   | 18302298/29288824 [00:39<00:16, 646674.80it/s][A
Building vocab from file:  63%|██████▎   | 18367064/29288824 [00:39<00:17, 622237.95it/s][A
Building vocab from file:  63%|██████▎   | 18433187/29288824 [00:39<00:17, 633442.89it/s][A
Building vocab from file:  63%|██████▎   | 18496768/29288824 [00:39<00:19, 563682.85it/s][A
Building vocab from file:  63%|██████▎   | 18558974/29288824 [00:39<00:18, 580011.59it/s][A
Building vocab from file:  64%|██████▎   | 18624385/29288824 [00:39<00:17, 600416.13it/s][A
Building vocab from file:  64%|██████▍   | 18689540/29288824 [00:39<00:17, 614890.50it/s][A
Building vocab from file:  64%|██████▍   | 18755087/29288824 [00:40<00:16, 626521.83it/s][A
Building vocab from file:  64%|██████▍   | 18818366/29288824 [00:40<00:16, 621408.59it/s][A
Building vocab from file:  64%|██████▍   | 18880955/29288824 [00:40<00

Building vocab from file:  80%|████████  | 23541426/29288824 [01:12<00:09, 601551.48it/s][A
Building vocab from file:  81%|████████  | 23602458/29288824 [01:12<00:09, 604153.50it/s][A
Building vocab from file:  81%|████████  | 23662990/29288824 [01:12<00:09, 603895.80it/s][A
Building vocab from file:  81%|████████  | 23725547/29288824 [01:12<00:09, 609405.91it/s][A
Building vocab from file:  81%|████████  | 23788255/29288824 [01:12<00:08, 614602.38it/s][A
Building vocab from file:  81%|████████▏ | 23849774/29288824 [01:12<00:08, 611713.67it/s][A
Building vocab from file:  82%|████████▏ | 23911111/29288824 [01:13<00:08, 612209.37it/s][A
Building vocab from file:  82%|████████▏ | 23972363/29288824 [01:13<00:08, 602383.55it/s][A
Building vocab from file:  82%|████████▏ | 24033044/29288824 [01:13<00:08, 603695.84it/s][A
Building vocab from file:  82%|████████▏ | 24093455/29288824 [01:13<00:08, 601710.22it/s][A
Building vocab from file:  82%|████████▏ | 24153710/29288824 [01:13<00

Building vocab from file:  97%|█████████▋| 28475106/29288824 [01:21<00:01, 484574.24it/s][A
Building vocab from file:  97%|█████████▋| 28524509/29288824 [01:21<00:01, 487370.47it/s][A
Building vocab from file:  98%|█████████▊| 28573367/29288824 [01:21<00:01, 483669.82it/s][A
Building vocab from file:  98%|█████████▊| 28621825/29288824 [01:21<00:01, 466892.70it/s][A
Building vocab from file:  98%|█████████▊| 28670891/29288824 [01:21<00:01, 473777.36it/s][A
Building vocab from file:  98%|█████████▊| 28723181/29288824 [01:21<00:01, 487516.76it/s][A
Building vocab from file:  98%|█████████▊| 28772143/29288824 [01:22<00:01, 308436.75it/s][A
Building vocab from file:  98%|█████████▊| 28821427/29288824 [01:22<00:01, 347435.61it/s][A
Building vocab from file:  99%|█████████▊| 28863691/29288824 [01:22<00:02, 175636.47it/s][A
Building vocab from file:  99%|█████████▊| 28908240/29288824 [01:22<00:01, 214641.72it/s][A
Building vocab from file:  99%|█████████▉| 28946712/29288824 [01:22<00

Length of vocab: 5934585
User count: 5868142
Subreddit count: 66443
User to politic counts: 2626
[('Toad0430', Counter({'Republican': 2})), ('stephen89', Counter({'Republican': 2})), ('Friar_Strzok', Counter({'Republican': 1})), ('IntegrateIt', Counter({'Republican': 1})), ('justinandashley', Counter({'Republican': 1})), ('Hillarys_cellmate', Counter({'Republican': 2})), ('ARandomConservative', Counter({'Republican': 1})), ('Personage1', Counter({'Democrat': 1})), ('joebo20_00', Counter({'Republican': 1})), ('LeChevalierMal-Fait', Counter({'Republican': 5}))]
Saw political affiliations for 2626 users
User to politics training size: {}: 2364
User to politics validation size: {}: 262


Converting data to PyTorch: 100%|██████████| 5868142/5868142 [09:32<00:00, 10255.31it/s]


Train size: 158159650 Validation size: 17573294



Building vocab from file:   0%|          | 0/30972308 [00:00<?, ?it/s][A
Building vocab from file:   0%|          | 2950/30972308 [00:04<14:01:13, 613.57it/s][A
Building vocab from file:   0%|          | 27326/30972308 [00:08<10:09:05, 846.75it/s][A
Building vocab from file:   0%|          | 84270/30972308 [00:08<7:05:50, 1208.88it/s][A
Building vocab from file:   0%|          | 147380/30972308 [00:08<4:57:43, 1725.55it/s][A
Building vocab from file:   1%|          | 215952/30972308 [00:08<3:28:10, 2462.42it/s][A
Building vocab from file:   1%|          | 291377/30972308 [00:08<2:25:33, 3512.83it/s][A
Building vocab from file:   1%|          | 345798/30972308 [00:08<1:41:59, 5004.48it/s][A
Building vocab from file:   1%|▏         | 414950/30972308 [00:08<1:11:27, 7127.16it/s][A
Building vocab from file:   2%|▏         | 486790/30972308 [00:08<50:06, 10138.54it/s] [A
Building vocab from file:   2%|▏         | 563116/30972308 [00:08<35:11, 14401.65it/s][A
Building vocab from 

Building vocab from file:  20%|█▉        | 6057148/30972308 [00:18<00:37, 657948.19it/s][A
Building vocab from file:  20%|█▉        | 6127814/30972308 [00:18<00:36, 671839.59it/s][A
Building vocab from file:  20%|██        | 6197980/30972308 [00:18<00:36, 680515.85it/s][A
Building vocab from file:  20%|██        | 6269127/30972308 [00:18<00:35, 689513.34it/s][A
Building vocab from file:  20%|██        | 6343936/30972308 [00:19<00:34, 706099.21it/s][A
Building vocab from file:  21%|██        | 6414822/30972308 [00:19<00:34, 706026.95it/s][A
Building vocab from file:  21%|██        | 6486206/30972308 [00:19<00:34, 708350.03it/s][A
Building vocab from file:  21%|██        | 6557280/30972308 [00:19<00:34, 709042.77it/s][A
Building vocab from file:  21%|██▏       | 6628281/30972308 [00:19<00:36, 670862.11it/s][A
Building vocab from file:  22%|██▏       | 6699131/30972308 [00:19<00:35, 681726.04it/s][A
Building vocab from file:  22%|██▏       | 6767692/30972308 [00:19<00:36, 666748

Building vocab from file:  39%|███▉      | 12132094/30972308 [00:27<00:28, 663051.57it/s][A
Building vocab from file:  39%|███▉      | 12201235/30972308 [00:27<00:27, 671311.52it/s][A
Building vocab from file:  40%|███▉      | 12270388/30972308 [00:27<00:27, 677251.54it/s][A
Building vocab from file:  40%|███▉      | 12338630/30972308 [00:28<00:27, 678793.55it/s][A
Building vocab from file:  40%|████      | 12407419/30972308 [00:28<00:27, 681495.38it/s][A
Building vocab from file:  40%|████      | 12476714/30972308 [00:28<00:27, 684886.68it/s][A
Building vocab from file:  41%|████      | 12545286/30972308 [00:28<00:28, 637582.21it/s][A
Building vocab from file:  41%|████      | 12609759/30972308 [00:28<00:29, 612711.25it/s][A
Building vocab from file:  41%|████      | 12671738/30972308 [00:28<00:30, 597225.92it/s][A
Building vocab from file:  41%|████      | 12732043/30972308 [00:28<00:30, 591202.50it/s][A
Building vocab from file:  41%|████▏     | 12793962/30972308 [00:28<00

Building vocab from file:  58%|█████▊    | 17976511/30972308 [00:36<00:19, 663592.23it/s][A
Building vocab from file:  58%|█████▊    | 18043910/30972308 [00:37<00:19, 663123.57it/s][A
Building vocab from file:  58%|█████▊    | 18112703/30972308 [00:37<00:19, 670374.39it/s][A
Building vocab from file:  59%|█████▊    | 18179798/30972308 [00:37<00:19, 665534.74it/s][A
Building vocab from file:  59%|█████▉    | 18246400/30972308 [00:37<00:19, 663681.06it/s][A
Building vocab from file:  59%|█████▉    | 18315192/30972308 [00:37<00:18, 670765.58it/s][A
Building vocab from file:  59%|█████▉    | 18382312/30972308 [00:37<00:19, 660822.14it/s][A
Building vocab from file:  60%|█████▉    | 18448458/30972308 [00:37<00:19, 658177.70it/s][A
Building vocab from file:  60%|█████▉    | 18514323/30972308 [00:37<00:18, 657372.55it/s][A
Building vocab from file:  60%|█████▉    | 18580094/30972308 [00:37<00:18, 655070.71it/s][A
Building vocab from file:  60%|██████    | 18645626/30972308 [00:37<00

Building vocab from file:  76%|███████▌  | 23548872/30972308 [00:46<00:12, 614149.43it/s][A
Building vocab from file:  76%|███████▌  | 23610589/30972308 [00:46<00:12, 573026.01it/s][A
Building vocab from file:  76%|███████▋  | 23668659/30972308 [00:46<00:13, 552615.12it/s][A
Building vocab from file:  77%|███████▋  | 23724614/30972308 [00:46<00:14, 504170.90it/s][A
Building vocab from file:  77%|███████▋  | 23785772/30972308 [00:46<00:13, 532211.38it/s][A
Building vocab from file:  77%|███████▋  | 23847391/30972308 [00:46<00:12, 554898.09it/s][A
Building vocab from file:  77%|███████▋  | 23910589/30972308 [00:47<00:12, 575971.24it/s][A
Building vocab from file:  77%|███████▋  | 23969220/30972308 [00:47<00:12, 553142.50it/s][A
Building vocab from file:  78%|███████▊  | 24025452/30972308 [00:47<00:12, 547207.03it/s][A
Building vocab from file:  78%|███████▊  | 24082207/30972308 [00:47<00:12, 553127.14it/s][A
Building vocab from file:  78%|███████▊  | 24137995/30972308 [00:47<00

Building vocab from file:  93%|█████████▎| 28664134/30972308 [01:30<00:04, 559947.36it/s][A
Building vocab from file:  93%|█████████▎| 28721539/30972308 [01:31<00:03, 564103.03it/s][A
Building vocab from file:  93%|█████████▎| 28778026/30972308 [01:31<00:04, 541859.60it/s][A
Building vocab from file:  93%|█████████▎| 28833817/30972308 [01:31<00:03, 545825.88it/s][A
Building vocab from file:  93%|█████████▎| 28891410/30972308 [01:31<00:03, 554520.41it/s][A
Building vocab from file:  93%|█████████▎| 28947019/30972308 [01:31<00:04, 489606.48it/s][A
Building vocab from file:  94%|█████████▎| 28997482/30972308 [01:31<00:04, 486928.74it/s][A
Building vocab from file:  94%|█████████▍| 29048061/30972308 [01:31<00:03, 492435.82it/s][A
Building vocab from file:  94%|█████████▍| 29102828/30972308 [01:31<00:03, 507798.55it/s][A
Building vocab from file:  94%|█████████▍| 29158437/30972308 [01:31<00:03, 521378.75it/s][A
Building vocab from file:  94%|█████████▍| 29211119/30972308 [01:32<00

Length of vocab: 6138759
User count: 6071436
Subreddit count: 67323
User to politic counts: 2924
[('CaptChrisPBacon', Counter({'Republican': 2})), ('UndergroundSurface', Counter({'Republican': 2})), ('CharlesEarlBoles', Counter({'Republican': 2})), ('mysteryman64030', Counter({'Republican': 2})), ('derangedwinchester', Counter({'Republican': 2})), ('bro1979', Counter({'Republican': 1})), ('Go_to_the_Astra', Counter({'Republican': 2})), ('guanaco55', Counter({'Republican': 2})), ('DokkaeBeast', Counter({'Republican': 1})), ('PropolisBee', Counter({'Republican': 2}))]
Saw political affiliations for 2919 users
User to politics training size: {}: 2628
User to politics validation size: {}: 291


Converting data to PyTorch: 100%|██████████| 6071436/6071436 [09:59<00:00, 10121.16it/s]


Train size: 167250464 Validation size: 18583384


In [14]:
# Minimum value of subreddit_counts[sub] required before a subreddit's average
# score is reported. Despite the name, the value compared against it is the
# number of (labelled user, subreddit) pairs seen below, not a raw post count.
MIN_POST_THRESHOLD = 20

# For every month: average the political score of the labelled users seen in
# each subreddit, then print the 25 highest-scoring ("right leaning") and the
# 25 lowest-scoring ("left leaning") subreddits that pass the threshold.
for year_month in year_months:
    dataset = datasets[year_month]
    # Not used in this cell; kept so the notebook-level name stays bound as before.
    model = models[year_month]
    user_subreddits = dataset.user_subreddits
    subreddit_scores = defaultdict(int)   # sub -> sum of user scores
    subreddit_counts = Counter()          # sub -> number of contributing users
    subreddit_users = defaultdict(set)    # sub -> contributing user names

    # NOTE(review): `user_to_politics` is not rebound inside this loop, so every
    # month is scored against whatever mapping is currently in the kernel.
    # Confirm it corresponds to `year_month` (or index it per month) before
    # trusting results for more than one month.
    for user, score in user_to_politics.items():
        for sub in user_subreddits[user]:
            # Skip names whose 3rd-4th characters are 'u_' -- presumably
            # 'r/u_<name>' user-profile pages rather than real subreddits.
            if sub[2:4] != 'u_':
                subreddit_scores[sub] += score
                subreddit_counts[sub] += 1
                subreddit_users[sub].add(user)

    # Average score per subreddit, keeping only subreddits with enough users.
    norm_sub_scores = {}
    for sub, score in subreddit_scores.items():
        count = subreddit_counts[sub]
        if count >= MIN_POST_THRESHOLD:
            norm_sub_scores[sub] = score / count

    # Highest averages are reported as right leaning, lowest as left leaning.
    # Each direction is sorted separately so that ties keep insertion order,
    # exactly as two independent stable sorts would.
    for leaning, descending in (("right", True), ("left", False)):
        sorted_sub_scores = dict(
            sorted(norm_sub_scores.items(), key=lambda item: item[1], reverse=descending)
        )
        top_results = dict(itertools.islice(sorted_sub_scores.items(), 25))

        print("Top 25 {} leaning subreddits for year: {}".format(leaning, year_month))

        for sub, score in top_results.items():
            print(sub, score, subreddit_counts[sub])

Top 25 right leaning subreddits for year: 2019-04
r/TumblrInAction 1.0 55
r/progun 1.0 59
r/Firearms 1.0 48
r/Catholicism 1.0 32
r/prolife 1.0 29
r/Cringetopia 1.0 42
r/The_MuellerMeltdown 1.0 51
r/drumpfisfinished 1.0 29
r/ShitPoliticsSays 1.0 105
r/TheNewRight 1.0 35
r/tucker_carlson 1.0 21
r/RightwingLGBT 1.0 33
r/benshapiro 1.0 30
r/ar15 1.0 28
r/weekendgunnit 1.0 24
r/gunpolitics 1.0 32
r/walkaway 1.0 29
r/ImGoingToHellForThis 1.0 50
r/Rainbow6 1.0 29
r/gundeals 1.0 45
r/frenworld 1.0 27
r/ZooForTheLowIQRight 1.0 20
r/awfuleverything 1.0 20
r/texas 1.0 21
r/MensRights 1.0 52
Top 25 left leaning subreddits for year: 2019-04
r/AskALiberal 0.16201117318435754 179
r/centerleftpolitics 0.18181818181818182 22
r/CanadaPolitics 0.358974358974359 39
r/neoliberal 0.45454545454545453 33
r/ENLIGHTENEDCENTRISM 0.5416666666666666 24
r/AskTrumpSupporters 0.5666666666666667 60
r/PoliticalDiscussion 0.6122448979591837 49
r/changemyview 0.6326530612244898 49
r/bestof 0.64 25
r/ProgrammerHumor 0.645