# Sanity Checks

In [1]:
import os 
import sys
import glob

sys.path.append('/home/kalkiek/projects/reddit-political-affiliation/')

import itertools
import random
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from tqdm.notebook import tqdm
from collections import defaultdict, Counter
from sklearn.metrics import accuracy_score, f1_score

from src.data.make_dataset import build_dataset
from src.models.word2vec.User2Subreddit import User2Subreddit

from sklearn.metrics import auc, roc_curve

### Load In Comment Affiliations

In [2]:
# Month whose comment-affiliation files are read below.
year_month = '2019-09'
comment_directory = '/shared/0/projects/reddit-political-affiliation/data/comment-affiliations/*{}.tsv'.format(year_month)
files = glob.glob(comment_directory)

# user name -> lower-cased affiliation string. When a user appears in more than
# one row, the row read last overwrites the earlier ones.
user_to_politics = {}

for fname in files:
    with open(fname, 'r') as f:
        # Every row must contain exactly two tab-separated fields.
        for user, politics in (row.split('\t') for row in f):
            user_to_politics[user] = politics.strip().lower()

In [3]:
# Tally the comment-derived labels: True counts "democrat", False counts every
# other label (all of which are reported as republican).
is_democrat_counts = Counter(politics == "democrat" for politics in user_to_politics.values())
dems, reps = is_democrat_counts[True], is_democrat_counts[False]

print(dems, reps)

5850 2626


## Distribution of Validation Set

In [4]:
# Build the dataset for a single month from its network, flair and comment files.
year_month = '2019-09'

network_path = '/shared/0/projects/reddit-political-affiliation/data/bipartite-networks/{}_filtered.tsv'.format(year_month)
flair_directory = '/shared/0/projects/reddit-political-affiliation/data/flair-affiliations/{}.tsv'.format(year_month)
comment_directory = '/shared/0/projects/reddit-political-affiliation/data/comment-affiliations/*{}.tsv'.format(year_month)
print(comment_directory)

# build_dataset hands back five objects; later cells read `dataset` and `pol_validation`.
dataset, training, validation, pol_validation, vocab = build_dataset(
    network_path,
    flair_directory,
    comment_directory,
)

/shared/0/projects/reddit-political-affiliation/data/comment-affiliations/*2019-09.tsv



Building vocab from file:   0%|          | 0/27368767 [00:00<?, ?it/s][A
Building vocab from file:   0%|          | 6546/27368767 [00:00<31:49, 14326.36it/s][A
Building vocab from file:   0%|          | 78923/27368767 [00:00<22:24, 20294.07it/s][A
Building vocab from file:   1%|          | 147498/27368767 [00:00<15:50, 28628.43it/s][A
Building vocab from file:   1%|          | 183481/27368767 [00:01<13:23, 33845.80it/s][A
Building vocab from file:   1%|          | 255307/27368767 [00:01<09:32, 47394.01it/s][A
Building vocab from file:   1%|          | 324353/27368767 [00:01<06:51, 65770.87it/s][A
Building vocab from file:   1%|▏         | 397586/27368767 [00:01<04:58, 90475.92it/s][A
Building vocab from file:   2%|▏         | 465409/27368767 [00:01<03:40, 122261.41it/s][A
Building vocab from file:   2%|▏         | 534815/27368767 [00:01<02:45, 162162.28it/s][A
Building vocab from file:   2%|▏         | 606682/27368767 [00:01<02:06, 211233.17it/s][A
Building vocab from file:

Building vocab from file:  20%|██        | 5506505/27368767 [00:14<01:00, 361732.06it/s][A
Building vocab from file:  20%|██        | 5566363/27368767 [00:14<00:53, 407384.75it/s][A
Building vocab from file:  21%|██        | 5633565/27368767 [00:14<00:47, 461959.20it/s][A
Building vocab from file:  21%|██        | 5696438/27368767 [00:15<00:43, 501897.58it/s][A
Building vocab from file:  21%|██        | 5761537/27368767 [00:15<00:40, 538924.53it/s][A
Building vocab from file:  21%|██▏       | 5824462/27368767 [00:15<00:38, 560930.57it/s][A
Building vocab from file:  22%|██▏       | 5893897/27368767 [00:15<00:36, 595241.35it/s][A
Building vocab from file:  22%|██▏       | 5958581/27368767 [00:15<00:35, 609820.75it/s][A
Building vocab from file:  22%|██▏       | 6025037/27368767 [00:15<00:34, 625270.16it/s][A
Building vocab from file:  22%|██▏       | 6090252/27368767 [00:15<00:34, 614279.29it/s][A
Building vocab from file:  22%|██▏       | 6153605/27368767 [00:15<00:34, 617730

Building vocab from file:  40%|████      | 11080677/27368767 [00:28<00:25, 627901.18it/s][A
Building vocab from file:  41%|████      | 11147103/27368767 [00:28<00:25, 638382.34it/s][A
Building vocab from file:  41%|████      | 11212323/27368767 [00:28<00:25, 626597.10it/s][A
Building vocab from file:  41%|████      | 11275999/27368767 [00:28<00:25, 628581.01it/s][A
Building vocab from file:  41%|████▏     | 11339570/27368767 [00:28<00:25, 626182.64it/s][A
Building vocab from file:  42%|████▏     | 11402690/27368767 [00:28<00:25, 624420.17it/s][A
Building vocab from file:  42%|████▏     | 11467406/27368767 [00:29<00:25, 631072.27it/s][A
Building vocab from file:  42%|████▏     | 11531645/27368767 [00:29<00:24, 634424.33it/s][A
Building vocab from file:  42%|████▏     | 11595276/27368767 [00:29<00:24, 633093.83it/s][A
Building vocab from file:  43%|████▎     | 11659522/27368767 [00:29<00:24, 635873.26it/s][A
Building vocab from file:  43%|████▎     | 11723205/27368767 [00:29<00

Building vocab from file:  60%|██████    | 16473118/27368767 [00:41<00:16, 653720.12it/s][A
Building vocab from file:  60%|██████    | 16541643/27368767 [00:41<00:16, 662869.88it/s][A
Building vocab from file:  61%|██████    | 16608025/27368767 [00:41<00:17, 629573.60it/s][A
Building vocab from file:  61%|██████    | 16671398/27368767 [00:41<00:17, 628022.19it/s][A
Building vocab from file:  61%|██████    | 16737693/27368767 [00:41<00:16, 638107.18it/s][A
Building vocab from file:  61%|██████▏   | 16802218/27368767 [00:42<00:16, 640232.19it/s][A
Building vocab from file:  62%|██████▏   | 16866414/27368767 [00:45<02:38, 66245.02it/s] [A
Building vocab from file:  62%|██████▏   | 16913897/27368767 [00:45<01:57, 89296.43it/s][A
Building vocab from file:  62%|██████▏   | 16965760/27368767 [00:45<01:27, 118800.00it/s][A
Building vocab from file:  62%|██████▏   | 17019505/27368767 [00:45<01:06, 155027.96it/s][A
Building vocab from file:  62%|██████▏   | 17073945/27368767 [00:45<00:

Building vocab from file:  79%|███████▊  | 21522505/27368767 [00:56<00:10, 562637.14it/s][A
Building vocab from file:  79%|███████▉  | 21581495/27368767 [00:56<00:10, 570547.05it/s][A
Building vocab from file:  79%|███████▉  | 21640392/27368767 [00:56<00:09, 575922.13it/s][A
Building vocab from file:  79%|███████▉  | 21698268/27368767 [00:59<01:36, 58939.39it/s] [A
Building vocab from file:  79%|███████▉  | 21747256/27368767 [00:59<01:10, 80070.44it/s][A
Building vocab from file:  80%|███████▉  | 21800488/27368767 [00:59<00:51, 107459.01it/s][A
Building vocab from file:  80%|███████▉  | 21855066/27368767 [00:59<00:38, 141566.75it/s][A
Building vocab from file:  80%|████████  | 21909310/27368767 [00:59<00:30, 181893.49it/s][A
Building vocab from file:  80%|████████  | 21962886/27368767 [01:00<00:23, 226841.71it/s][A
Building vocab from file:  80%|████████  | 22015586/27368767 [01:00<00:19, 273589.36it/s][A
Building vocab from file:  81%|████████  | 22067375/27368767 [01:00<00:

Building vocab from file:  95%|█████████▌| 26010485/27368767 [01:11<00:02, 455526.58it/s][A
Building vocab from file:  95%|█████████▌| 26061603/27368767 [01:11<00:02, 470905.80it/s][A
Building vocab from file:  95%|█████████▌| 26112281/27368767 [01:11<00:02, 481122.67it/s][A
Building vocab from file:  96%|█████████▌| 26160714/27368767 [01:17<00:43, 28017.88it/s] [A
Building vocab from file:  96%|█████████▌| 26210645/27368767 [01:17<00:29, 39085.58it/s][A
Building vocab from file:  96%|█████████▌| 26260972/27368767 [01:17<00:20, 54037.92it/s][A
Building vocab from file:  96%|█████████▌| 26310884/27368767 [01:17<00:14, 73773.91it/s][A
Building vocab from file:  96%|█████████▋| 26361722/27368767 [01:17<00:10, 99220.46it/s][A
Building vocab from file:  97%|█████████▋| 26411203/27368767 [01:17<00:07, 130526.21it/s][A
Building vocab from file:  97%|█████████▋| 26460011/27368767 [01:17<00:05, 167292.17it/s][A
Building vocab from file:  97%|█████████▋| 26507816/27368767 [01:17<00:04,

Length of vocab: 5777375
User count: 5714563
Subreddit count: 62812
User to politic counts: 3322
[('JobieWanKenobi', Counter({'republican': 1})), ('Iowa_Hawkeye', Counter({'republican': 2})), ('Blue387', Counter({'republican': 2})), ('_Hospitaller_', Counter({'republican': 1})), ('RogueHippie', Counter({'republican': 1})), ('SuicidalTendies', Counter({'republican': 1})), ('Znut55', Counter({'republican': 1})), ('rollingrock16', Counter({'republican': 1})), ('Immigrants_go_home', Counter({'republican': 1})), ('nukesiliconvalleyplz', Counter({'republican': 1}))]
Saw political affiliations for 3321 users from flairs
Number of democrats: 408
Number of republicans: 2913
Saw political affiliations for 8476 users from comments
Number of democrats: 5850
Number of republicans: 2626
User to politics training size: {}: 10487
User to politics validation size: {}: 1165


Converting data to PyTorch: 100%|██████████| 5714563/5714563 [08:21<00:00, 11395.92it/s]


Train size: 147791342 Validation size: 16421260


In [5]:
# Share of each label in the political validation split: label 0 is counted as
# democrat, any other label as republican.
dems = sum(1 for label in pol_validation.values() if label == 0)
reps = len(pol_validation) - dems

total = reps + dems
print("Percent republicans: {}".format(reps/total))
print("Percent democrats: {}".format(dems/total))

Percent republicans: 0.46008583690987126
Percent democrats: 0.5399141630901287


## Sklearn Dummy Classifiers

In [6]:
# Parallel lists: the dataset index of each validation user and that user's label.
user_ids, pol_labels = [], []
# Validation users that have no entry in dataset.user_to_idx. They are left out
# of both lists, so the baseline below is scored on fewer users than the
# distribution printed above; report the number instead of dropping them silently.
missing_users = 0

for user, pol_label in pol_validation.items():
    try:
        user_idx = dataset.user_to_idx[user]
    except KeyError:
        missing_users += 1
        continue
    # Append to both lists only after the lookup succeeded so they stay aligned.
    user_ids.append(user_idx)
    pol_labels.append(pol_label)

print("Skipped {} of {} validation users missing from dataset.user_to_idx".format(
    missing_users, len(pol_validation)))

In [7]:
from sklearn.dummy import DummyClassifier

# Majority-class baseline: always predicts the most frequent label it was fitted on.
# NOTE(review): the baseline is fitted and scored on the same validation users;
# confirm that is intended rather than fitting on the training labels.
dummy_clf = DummyClassifier(strategy="most_frequent")

# scikit-learn documents X as (n_samples, n_features), so give every user id its
# own one-feature row instead of passing a flat list. The dummy strategy ignores
# the feature values themselves.
dummy_features = [[user_id] for user_id in user_ids]
dummy_clf.fit(dummy_features, pol_labels)
y_pred = dummy_clf.predict(dummy_features)
print("Dummy mean accuracy: {}".format(dummy_clf.score(dummy_features, pol_labels)))
# f1_score expects (y_true, y_pred). The original call had them swapped, which
# made the undefined-metric warning blame the true labels instead of the
# predictions.
print("F1 Score (Macro): {}".format(f1_score(pol_labels, y_pred, average='macro')))

Dummy mean accuracy: 0.5352380952380953
F1 Score (Macro): 0.34863523573200994


  'recall', 'true', average, warn_for)


# Single Month Analysis Not Using Predictions

In [8]:
def read_political_affiliations(files):
    """Read affiliation TSV files and return binary labels per user.

    Each row must hold three tab-separated fields: user, affiliation and an
    integer frequency. Frequencies are summed per (user, affiliation) pair over
    all files. Users seen with more than one distinct affiliation are left out.

    Args:
        files: Iterable of file paths to read.

    Returns:
        The result of ``convert_affiliations_to_binary`` applied to the
        ``{user: affiliation}`` mapping of the remaining users.
    """
    user_to_politic_counts = defaultdict(Counter)

    for fname in tqdm(files):
        with open(fname, 'rt') as f:
            for user, politics, freq in (row.split('\t') for row in f):
                user_to_politic_counts[user][politics] += int(freq)

    print("User to politic counts: " + str(len(user_to_politic_counts)))
    print(list(user_to_politic_counts.items())[:10])

    # Keep only users whose rows all agree on a single affiliation.
    user_to_politics = {
        user: list(counts)[0]
        for user, counts in user_to_politic_counts.items()
        if len(counts) <= 1
    }

    print('Saw political affiliations for %d users' % len(user_to_politics))
    return convert_affiliations_to_binary(user_to_politics)


def convert_affiliations_to_binary(user_to_politics):
    """Replace affiliation values with binary labels, in place.

    String values are compared after stripping whitespace and lower-casing, so
    "Democrat", "democrat" and "democrat\\n" all map to 0; every other string
    maps to 1. Values that are already 0 or 1 are left as they are, which makes
    it safe to call the function again on its own output (for example when a
    notebook cell is re-run). Any other value maps to 1.

    Args:
        user_to_politics: Dict of user -> affiliation. Modified in place.

    Returns:
        The same dict object, now holding 0 (democrat) or 1 (anything else).
    """
    # Only existing keys are reassigned, so the dict never changes size while
    # it is being iterated.
    for user, politics in user_to_politics.items():
        if isinstance(politics, str):
            is_democrat = politics.strip().lower() == "democrat"
            user_to_politics[user] = 0 if is_democrat else 1
        elif politics not in (0, 1):
            user_to_politics[user] = 1

    return user_to_politics

# From here on `user_to_politics` holds the binary labels returned by
# read_political_affiliations for this month's flair file.
flair_directory = "/shared/0/projects/reddit-political-affiliation/data/flair-affiliations/{}.tsv".format(year_month)
flair_files = glob.glob(flair_directory)
user_to_politics = read_political_affiliations(flair_files)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


User to politic counts: 3322
[('JobieWanKenobi', Counter({'Republican': 1})), ('Iowa_Hawkeye', Counter({'Republican': 2})), ('Blue387', Counter({'Republican': 2})), ('_Hospitaller_', Counter({'Republican': 1})), ('RogueHippie', Counter({'Republican': 1})), ('SuicidalTendies', Counter({'Republican': 1})), ('Znut55', Counter({'Republican': 1})), ('rollingrock16', Counter({'Republican': 1})), ('Immigrants_go_home', Counter({'Republican': 1})), ('nukesiliconvalleyplz', Counter({'Republican': 1}))]
Saw political affiliations for 3321 users


In [12]:
# Label distribution: a value of 1 is counted as republican, anything else as democrat.
reps = sum(1 for label in user_to_politics.values() if label == 1)
dems = len(user_to_politics) - reps

total = reps + dems
print("Percent republicans: {}".format(reps/total))
print("Percent democrats: {}".format(dems/total))

Percent republicans: 0.8771454381210478
Percent democrats: 0.12285456187895212


## Polarized Subreddits

In [10]:
user_subreddits = dataset.user_subreddits
subreddit_scores = defaultdict(int)   # subreddit -> sum of binary labels
subreddit_counts = Counter()          # subreddit -> number of labels summed
subreddit_users = defaultdict(set)    # subreddit -> labelled users seen there

# Minimum number of label contributions a subreddit needs before it is ranked.
# A contribution is added once per labelled user per entry in that user's
# subreddit collection.
MIN_POST_THRESHOLD = 20

for user, score in user_to_politics.items():
    subreddits = user_subreddits[user]
    for sub in subreddits:
        # Characters 2-3 are 'u_' for names of the form 'r/u_<name>'; skip those.
        if sub[2:4] != 'u_':
            subreddit_scores[sub] += score
            subreddit_counts[sub] += 1
            subreddit_users[sub].add(user)

# Mean label per subreddit (0 = all democrat, 1 = all republican), restricted
# to subreddits that reach the threshold. The threshold constant is now
# actually used instead of a second hard-coded 20.
norm_sub_scores = {}

for sub, score in subreddit_scores.items():
    count = subreddit_counts[sub]
    if count >= MIN_POST_THRESHOLD:
        norm_sub_scores[sub] = score / count

# NOTE(review): the distribution cell above reports ~88% of flair-labelled users
# as republican, so a score of 1.0 from ~20 contributions is weak evidence of a
# right-leaning subreddit. Consider comparing against that base rate.
sorted_sub_scores = dict(sorted(norm_sub_scores.items(), key=lambda item: item[1], reverse=True))
top_results = dict(itertools.islice(sorted_sub_scores.items(), 25))

print("Top 25 right leaning subreddits")

for sub, score in top_results.items():
    print(sub, score, subreddit_counts[sub])

Top 25 right leaning subreddits
r/gundeals 1.0 48
r/walkaway 1.0 39
r/NOWTTYG 1.0 30
r/drumpfisfinished 1.0 47
r/Catholicism 1.0 32
r/DrainTheSwamp 1.0 63
r/TrueChristian 1.0 21
r/TheNewRight 1.0 56
r/DeclineIntoCensorship 1.0 21
r/WhereAreAllTheGoodMen 1.0 22
r/keto 1.0 24
r/Republican 1.0 96
r/MGTOW 1.0 50
r/HateCrimeHoaxes 1.0 21
r/metacanada 1.0 27
r/EarthPorn 1.0 25
r/kotakuinaction2 1.0 36
r/whatisthisthing 1.0 21
r/modernwarfare 1.0 43
r/clevercomebacks 1.0 20
r/aviation 1.0 24
r/The_Donald 0.9988276670574443 1706
r/ShitPoliticsSays 0.9925925925925926 135
r/Conservative 0.9885057471264368 696
r/AskThe_Donald 0.9867549668874173 151


In [11]:
# Rank subreddits by ascending mean label, so the lowest scores come first.
ascending_pairs = sorted(norm_sub_scores.items(), key=lambda pair: pair[1])
sorted_sub_scores = dict(ascending_pairs)
top_results = dict(ascending_pairs[:25])

print("Top 25 left leaning subreddits")

for sub, score in top_results.items():
    print(sub, score, subreddit_counts[sub])

Top 25 left leaning subreddits
r/LateStageImperialism 0.043478260869565216 23
r/AskALiberal 0.1896551724137931 232
r/CanadaPolitics 0.2549019607843137 51
r/neoliberal 0.35135135135135137 37
r/ukpolitics 0.4339622641509434 53
r/Gamingcirclejerk 0.52 25
r/ChapoTrapHouse 0.55 20
r/AskAnAmerican 0.5675675675675675 74
r/AskConservatives 0.6 45
r/Ask_Politics 0.6 20
r/unitedkingdom 0.6086956521739131 23
r/menwritingwomen 0.6190476190476191 21
r/SubredditDrama 0.625 32
r/liberalgunowners 0.625 24
r/SelfAwarewolves 0.6428571428571429 28
r/ToiletPaperUSA 0.6428571428571429 28
r/SandersForPresident 0.6585365853658537 41
r/changemyview 0.6619718309859155 71
r/PoliticalDiscussion 0.6666666666666666 54
r/LateStageCapitalism 0.6666666666666666 21
r/TheRightCantMeme 0.696969696969697 33
r/tumblr 0.7 20
r/ThatsInsane 0.7 20
r/moderatepolitics 0.7 30
r/ENLIGHTENEDCENTRISM 0.7073170731707317 41


## Major Subreddit Distributions

In [None]:
import seaborn as sns

# Subreddits whose label distributions are plotted below.
major_subreddits = [
    'r/politics',
    'r/pics',
    'r/AskReddit',
    'r/Conservative',
    'r/Liberal',
]

def plot_sub_scores(subreddit, subreddit_users):
    """Plot the distribution of political labels among one subreddit's users.

    Args:
        subreddit: Key into ``subreddit_users``; also used as the plot title.
        subreddit_users: Mapping from subreddit to an iterable of user names.

    Labels are looked up in the notebook-level ``user_to_politics`` mapping, so
    every user listed for the subreddit must have an entry there.
    """
    sns.set_theme(style="darkgrid")
    scores = [user_to_politics[member] for member in subreddit_users[subreddit]]

    grid = sns.displot(scores)
    grid.set(title=subreddit)

# One label histogram per subreddit of interest.
for subreddit_name in major_subreddits:
    plot_sub_scores(subreddit_name, subreddit_users)

# SVD On Subreddit Embeddings

In [None]:
from sklearn.decomposition import TruncatedSVD

# Rebuild the model with the dataset's user/subreddit counts and restore the
# saved weights onto the CPU.
embedding_dim = 50
PATH = '/shared/0/projects/reddit-political-affiliation/working-dir/word2vec-outputs/{}/9.pt'.format(year_month)
model = User2Subreddit(dataset.num_users(), embedding_dim, len(dataset.subreddit_to_idx))
saved_state = torch.load(PATH, map_location=torch.device('cpu'))
model.load_state_dict(saved_state)
print(model.eval())

# Truncated SVD (10 components) of the subreddit embedding matrix.
X = model.v_embeddings.weight.detach().numpy()
svd = TruncatedSVD(n_components=10, n_iter=7, random_state=42)
svd.fit(X)

In [None]:
# Per-component explained variance, its total, and the singular values.
variance_ratios = svd.explained_variance_ratio_
print(variance_ratios)
print(variance_ratios.sum())
print(svd.singular_values_)

# Multi-month Analysis Not Using Predictions

In [None]:
# Load the dataset and trained model for several months, keyed by 'YYYY-MM'.
year_months = ['2019-04', '2019-05', '2019-06']
models = {}
datasets = {}
embedding_dim = 50

# Everything created inside the loop uses `month`-prefixed names so this cell
# no longer overwrites `year_month`, `dataset`, `pol_validation`, `model`, etc.
# from the single-month analysis above. Re-running an earlier cell afterwards
# would otherwise silently pick up the last month loaded here.
for month in year_months:
    month_network_path = '/shared/0/projects/reddit-political-affiliation/data/bipartite-networks/' + month + '_filtered.tsv'
    month_flair_path = '/shared/0/projects/reddit-political-affiliation/data/flair-affiliations/' + month + '.tsv'

    # NOTE(review): the single-month cell also passes a comment-affiliation glob
    # as a third argument; confirm that omitting it here is intended.
    # Only the dataset (first returned item) is kept; the splits are discarded.
    month_dataset = build_dataset(month_network_path, month_flair_path)[0]

    month_checkpoint = '/shared/0/projects/reddit-political-affiliation/working-dir/word2vec-outputs/' + month + '/9.pt'
    month_model = User2Subreddit(month_dataset.num_users(), embedding_dim, len(month_dataset.subreddit_to_idx))
    month_model.load_state_dict(torch.load(month_checkpoint, map_location=torch.device('cpu')))

    models[month] = month_model
    datasets[month] = month_dataset

In [None]:
def _score_subreddits(user_subreddits, user_to_label, min_count):
    """Return ({subreddit: mean label}, Counter of label contributions).

    Each labelled user adds its label once per entry in its subreddit
    collection. Names whose characters 2-3 are 'u_' (e.g. 'r/u_<name>') are
    skipped, and only subreddits with at least ``min_count`` contributions get
    a mean score.
    """
    label_sums = defaultdict(int)
    label_counts = Counter()

    for user, label in user_to_label.items():
        try:
            subreddits = user_subreddits[user]
        except KeyError:
            # Labelled user with no activity recorded in this month's dataset.
            continue
        for sub in subreddits:
            if sub[2:4] != 'u_':
                label_sums[sub] += label
                label_counts[sub] += 1

    mean_scores = {
        sub: label_sums[sub] / count
        for sub, count in label_counts.items()
        if count >= min_count
    }
    return mean_scores, label_counts


def _print_ranked_subreddits(title, mean_scores, label_counts, descending, limit=25):
    """Print ``title`` followed by the first ``limit`` subreddits ordered by mean score."""
    ranked = sorted(mean_scores.items(), key=lambda item: item[1], reverse=descending)
    print(title)
    for sub, score in ranked[:limit]:
        print(sub, score, label_counts[sub])


# Minimum number of label contributions a subreddit needs before it is ranked.
MIN_POST_THRESHOLD = 20

for month in year_months:
    # Use this month's own flair labels. The original loop reused the
    # notebook-level `user_to_politics`, which holds the labels of the single
    # month loaded earlier, for every month.
    month_flair_files = glob.glob(
        '/shared/0/projects/reddit-political-affiliation/data/flair-affiliations/' + month + '.tsv')
    month_user_to_politics = read_political_affiliations(month_flair_files)

    month_scores, month_counts = _score_subreddits(
        datasets[month].user_subreddits, month_user_to_politics, MIN_POST_THRESHOLD)

    _print_ranked_subreddits(
        "Top 25 right leaning subreddits for year: {}".format(month),
        month_scores, month_counts, descending=True)
    _print_ranked_subreddits(
        "Top 25 left leaning subreddits for year: {}".format(month),
        month_scores, month_counts, descending=False)