# Single Month Analysis

## Load in the data and model

In [16]:
import os 
import sys

sys.path.append('/home/kalkiek/projects/reddit-political-affiliation/')

import itertools
import random
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from tqdm.notebook import tqdm
from collections import defaultdict
from sklearn.metrics import accuracy_score, f1_score

from src.data.make_dataset import build_dataset
from src.models.word2vec.User2Subreddit import User2Subreddit

from sklearn.metrics import auc, roc_curve

In [3]:
year_month = '2019-09'

network_path = '/shared/0/projects/reddit-political-affiliation/data/bipartite-networks/' + year_month + '_filtered.tsv'
flair_directory = '/shared/0/projects/reddit-political-affiliation/data/flair-affiliations/' + year_month + '.tsv'
comment_directory = '/shared/0/projects/reddit-political-affiliation/data/comment-affiliations/*' + year_month + ".tsv"

dataset, training, validation, pol_validation, vocab = build_dataset(network_path, flair_directory, comment_directory)


Building vocab from file:   0%|          | 0/27368767 [00:00<?, ?it/s][A
Building vocab from file:   0%|          | 6005/27368767 [00:00<40:12, 11343.31it/s][A
Building vocab from file:   0%|          | 18664/27368767 [00:01<33:40, 13536.54it/s][A
Building vocab from file:   0%|          | 94452/27368767 [00:01<23:41, 19191.02it/s][A
Building vocab from file:   1%|          | 166666/27368767 [00:01<16:43, 27107.01it/s][A
Building vocab from file:   1%|          | 243988/27368767 [00:01<11:50, 38151.09it/s][A
Building vocab from file:   1%|          | 315855/27368767 [00:01<08:27, 53289.16it/s][A
Building vocab from file:   1%|▏         | 394517/27368767 [00:01<06:04, 73979.47it/s][A
Building vocab from file:   2%|▏         | 468922/27368767 [00:01<04:25, 101365.56it/s][A
Building vocab from file:   2%|▏         | 543923/27368767 [00:01<03:15, 136879.47it/s][A
Building vocab from file:   2%|▏         | 622918/27368767 [00:01<02:26, 182024.55it/s][A
Building vocab from file: 

Building vocab from file:  22%|██▏       | 5956954/27368767 [00:12<00:36, 587094.36it/s][A
Building vocab from file:  22%|██▏       | 6016340/27368767 [00:12<00:37, 569395.00it/s][A
Building vocab from file:  22%|██▏       | 6076970/27368767 [00:13<00:36, 578947.51it/s][A
Building vocab from file:  22%|██▏       | 6138285/27368767 [00:13<00:36, 588800.14it/s][A
Building vocab from file:  23%|██▎       | 6197523/27368767 [00:14<02:08, 165092.51it/s][A
Building vocab from file:  23%|██▎       | 6254053/27368767 [00:14<01:40, 209610.95it/s][A
Building vocab from file:  23%|██▎       | 6303560/27368767 [00:14<01:23, 253453.56it/s][A
Building vocab from file:  23%|██▎       | 6360681/27368767 [00:14<01:09, 304224.03it/s][A
Building vocab from file:  23%|██▎       | 6426854/27368767 [00:14<00:57, 363068.92it/s][A
Building vocab from file:  24%|██▎       | 6488721/27368767 [00:14<00:50, 414435.54it/s][A
Building vocab from file:  24%|██▍       | 6552868/27368767 [00:14<00:44, 463666

Building vocab from file:  42%|████▏     | 11436252/27368767 [00:25<00:26, 612285.39it/s][A
Building vocab from file:  42%|████▏     | 11497977/27368767 [00:25<00:25, 613764.36it/s][A
Building vocab from file:  42%|████▏     | 11559423/27368767 [00:25<00:25, 613676.14it/s][A
Building vocab from file:  42%|████▏     | 11622834/27368767 [00:25<00:25, 619665.11it/s][A
Building vocab from file:  43%|████▎     | 11686716/27368767 [00:25<00:25, 625188.93it/s][A
Building vocab from file:  43%|████▎     | 11749280/27368767 [00:26<02:07, 122168.38it/s][A
Building vocab from file:  43%|████▎     | 11801460/27368767 [00:27<01:38, 158611.02it/s][A
Building vocab from file:  43%|████▎     | 11858901/27368767 [00:27<01:16, 202610.04it/s][A
Building vocab from file:  44%|████▎     | 11919839/27368767 [00:27<01:00, 253342.68it/s][A
Building vocab from file:  44%|████▍     | 11981395/27368767 [00:27<00:50, 307652.32it/s][A
Building vocab from file:  44%|████▍     | 12039658/27368767 [00:27<00

Building vocab from file:  61%|██████▏   | 16803608/27368767 [00:39<00:22, 466314.89it/s][A
Building vocab from file:  62%|██████▏   | 16866563/27368767 [00:39<00:20, 505647.01it/s][A
Building vocab from file:  62%|██████▏   | 16929555/27368767 [00:39<00:19, 537450.99it/s][A
Building vocab from file:  62%|██████▏   | 16991516/27368767 [00:39<00:18, 558314.12it/s][A
Building vocab from file:  62%|██████▏   | 17054286/27368767 [00:39<00:17, 577462.97it/s][A
Building vocab from file:  63%|██████▎   | 17116402/27368767 [00:39<00:17, 589909.98it/s][A
Building vocab from file:  63%|██████▎   | 17178496/27368767 [00:39<00:17, 593210.91it/s][A
Building vocab from file:  63%|██████▎   | 17239994/27368767 [00:39<00:16, 599419.82it/s][A
Building vocab from file:  63%|██████▎   | 17301475/27368767 [00:39<00:16, 599261.99it/s][A
Building vocab from file:  63%|██████▎   | 17363790/27368767 [00:39<00:16, 606232.34it/s][A
Building vocab from file:  64%|██████▎   | 17425187/27368767 [00:40<00

Building vocab from file:  80%|████████  | 21953968/27368767 [00:52<00:10, 527108.83it/s][A
Building vocab from file:  80%|████████  | 22010614/27368767 [00:52<00:09, 538326.57it/s][A
Building vocab from file:  81%|████████  | 22067327/27368767 [00:53<00:09, 546652.78it/s][A
Building vocab from file:  81%|████████  | 22123986/27368767 [00:53<00:09, 551191.22it/s][A
Building vocab from file:  81%|████████  | 22180510/27368767 [00:53<00:09, 542922.75it/s][A
Building vocab from file:  81%|████████  | 22235814/27368767 [00:53<00:09, 543340.28it/s][A
Building vocab from file:  81%|████████▏ | 22290856/27368767 [00:53<00:09, 541348.74it/s][A
Building vocab from file:  82%|████████▏ | 22345489/27368767 [00:53<00:09, 537128.44it/s][A
Building vocab from file:  82%|████████▏ | 22399558/27368767 [00:53<00:09, 532921.68it/s][A
Building vocab from file:  82%|████████▏ | 22454616/27368767 [00:53<00:09, 538098.86it/s][A
Building vocab from file:  82%|████████▏ | 22509266/27368767 [00:53<00

Building vocab from file:  96%|█████████▌| 26288612/27368767 [01:11<00:03, 348109.69it/s][A
Building vocab from file:  96%|█████████▌| 26329805/27368767 [01:11<00:02, 365075.24it/s][A
Building vocab from file:  96%|█████████▋| 26373846/27368767 [01:11<00:02, 384822.98it/s][A
Building vocab from file:  97%|█████████▋| 26415715/27368767 [01:11<00:02, 394391.48it/s][A
Building vocab from file:  97%|█████████▋| 26459771/27368767 [01:12<00:02, 407192.60it/s][A
Building vocab from file:  97%|█████████▋| 26503176/27368767 [01:12<00:02, 414891.79it/s][A
Building vocab from file:  97%|█████████▋| 26545824/27368767 [01:12<00:02, 409211.10it/s][A
Building vocab from file:  97%|█████████▋| 26587574/27368767 [01:12<00:01, 411278.50it/s][A
Building vocab from file:  97%|█████████▋| 26629778/27368767 [01:12<00:01, 414448.71it/s][A
Building vocab from file:  97%|█████████▋| 26671764/27368767 [01:12<00:01, 416057.22it/s][A
Building vocab from file:  98%|█████████▊| 26715411/27368767 [01:12<00

Length of vocab: 5777375
User count: 5714563
Subreddit count: 62812
User to politic counts: 3322
[('JobieWanKenobi', Counter({'republican': 1})), ('Iowa_Hawkeye', Counter({'republican': 2})), ('Blue387', Counter({'republican': 2})), ('_Hospitaller_', Counter({'republican': 1})), ('RogueHippie', Counter({'republican': 1})), ('SuicidalTendies', Counter({'republican': 1})), ('Znut55', Counter({'republican': 1})), ('rollingrock16', Counter({'republican': 1})), ('Immigrants_go_home', Counter({'republican': 1})), ('nukesiliconvalleyplz', Counter({'republican': 1}))]
Saw political affiliations for 3321 users from flairs
Number of democrats: 408
Number of republicans: 2913
Saw political affiliations for 8476 users from comments
Number of democrats: 5850
Number of republicans: 2626
User to politics training size: {}: 10487
User to politics validation size: {}: 1165


Converting data to PyTorch: 100%|██████████| 5714563/5714563 [08:38<00:00, 11024.93it/s]


Train size: 147791342 Validation size: 16421260


In [5]:
path = '/shared/0/projects/reddit-political-affiliation/data/word2vec/dataset/' + year_month
dataset.load_id_mappings(path)

# We'll also need these
word_to_ix = {word: i for i, word in enumerate(vocab)}
all_subreddits = {v for v in vocab if v[:2] == 'r/' and v[2:4] != 'u_'}
print("# of subreddits: " + str(len(all_subreddits)))

Loading user and subreddit id mappings
Loading user subreddits
# of subreddits: 60276


In [6]:
# Load in the actual model
PATH = '/shared/0/projects/reddit-political-affiliation/working-dir/word2vec-outputs/' + year_month + '/9.pt'
embedding_dim = 50

# Sorry for the hardcoding ... will update later
model = User2Subreddit(dataset.num_users(), embedding_dim, len(dataset.subreddit_to_idx))
model.load_state_dict(torch.load(PATH, map_location=torch.device('cpu')))
model.eval()

User2Subreddit(
  (u_embeddings): Embedding(5714563, 50)
  (v_embeddings): Embedding(62812, 50)
  (political_layer): Linear(in_features=50, out_features=1, bias=True)
  (before_pol_dropout): Dropout(p=0.5, inplace=False)
)

## Model Accuracy on Political Validation Set

In [19]:
user_ids, pol_labels = [], []

for user, pol_label in pol_validation.items():
    try:
        # User subreddit dataset spans 1 month. Political data spans the year. Some users might not be present
        user_ids.append(dataset.user_to_idx[user])
        pol_labels.append(pol_label)
    except KeyError:
        pass

user_ids = torch.LongTensor(user_ids)
pol_labels = torch.FloatTensor(pol_labels)

emb_p = model.u_embeddings(user_ids)
political_predictions = model.political_layer(emb_p)
political_predictions = torch.sigmoid(political_predictions)

preds = []
for val in political_predictions.detach().numpy():
    if val[0] >= 0.5:
        preds.append(1)
    else:
        preds.append(0)

labels = pol_labels.detach().numpy().astype(int)
print(accuracy_score(labels, preds))
print(f1_score(labels, preds, average='macro'))

0.5276190476190477
0.4662643836009527


### Question 1 - Are most users left leaning or right leaning

In [8]:
def predict_user_affiliations(model, dataset):
    user_predictions = {}
    loader = DataLoader(dataset, batch_size=512)

    idx_to_user = {v: k for k, v in dataset.user_to_idx.items()}

    for i, data in enumerate(tqdm(loader, desc="Predicting user politics", total=len(dataset) / 512)):
        user_sub, politics_labels, subreddit_labels = data
        user_ids = user_sub[:, 0]
    
        subreddit_ids = user_sub[:, 1]
        
        _, pol_preds = model(user_ids, subreddit_ids, political_user_ids=user_ids)
        
        user_ids = user_ids.detach().numpy()
        
        for j in range(len(user_ids)):
            user = idx_to_user[user_ids[j]]
            user_predictions[user] = pol_preds[j][0]

    return user_predictions

user_predictions = predict_user_affiliations(model, dataset)

HBox(children=(FloatProgress(value=0.0, description='Predicting user politics', max=320727.73828125, style=Pro…




KeyboardInterrupt: 

### Save the predictions for later

In [None]:
out_dir = '/shared/0/projects/reddit-political-affiliation/data/word2vec/predictions/users_' + year_month + '.tsv'

with open(out_dir, 'w') as f:
    for user, score in user_predictions.items():
        f.write("{}\t{}\n".format(user, score))

### Load in previous predictions

In [25]:
in_dir = '/shared/0/projects/reddit-political-affiliation/data/word2vec/predictions/users_' + year_month + '.tsv'
user_predictions = {}

with open(in_dir, 'r') as f:
    for line in f:
        user, score = line.split('\t')
        user_predictions[user] = float(score.strip())

In [39]:
threshold = 0.5
left_count, right_count = 0., 0.

for user, prediction in user_predictions.items():    
    if prediction >= threshold:
        right_count += 1
    else:
        left_count += 1
        
total = len(user_predictions)
percent_right = right_count / total
percent_left = left_count / total

print("Percent of users left leaning: {}".format(percent_left))
print("Percent of users right leaning: {}".format(percent_right))

Percent of users left leaning: 0.16876335752994798
Percent of users right leaning: 0.831236642470052


### Question 2 - Political Leanings of Major Subreddits

Find the top left and right subreddits

In [40]:
from collections import defaultdict, Counter

# Number of scores a subreddit needs before we include it in this analysis
MIN_POST_THRESHOLD = 1000

subreddit_scores = defaultdict(lambda:0)
subreddit_counts = Counter()
subreddit_users = defaultdict(set)


for user, prediction in user_predictions.items():
    subreddits = user_subreddits[user]
    for sub in subreddits:
        if sub[2:4] != 'u_':
            subreddit_scores[sub] += prediction
            subreddit_counts[sub] += 1
            subreddit_users[sub].add(user)
            
        
norm_sub_scores = {}

for sub, score in subreddit_scores.items():
    count = subreddit_counts[sub]
    if count >= MIN_POST_THRESHOLD:
        norm_sub_scores[sub] = score / count
    

sorted_sub_scores = {k: v for k, v in sorted(norm_sub_scores.items(), key=lambda item: item[1], reverse=True)}
top_results = dict(itertools.islice(sorted_sub_scores.items(), 50))

print("Top 50 right leaning subreddits")

for sub, score in top_results.items():
    print(sub, score, subreddit_counts[sub])

Top 50 right leaning subreddits
r/twinks tensor(0.9797, grad_fn=<DivBackward0>) 3247
r/ETHplode tensor(0.9766, grad_fn=<DivBackward0>) 1142
r/dubai tensor(0.9707, grad_fn=<DivBackward0>) 1928
r/Shadowverse tensor(0.9677, grad_fn=<DivBackward0>) 1480
r/spiderbro tensor(0.9662, grad_fn=<DivBackward0>) 1496
r/The_Donald tensor(0.9641, grad_fn=<DivBackward0>) 44652
r/INDYCAR tensor(0.9636, grad_fn=<DivBackward0>) 1611
r/wrestling tensor(0.9563, grad_fn=<DivBackward0>) 1113
r/subredditcancer tensor(0.9548, grad_fn=<DivBackward0>) 2057
r/MilitaryGfys tensor(0.9537, grad_fn=<DivBackward0>) 1855
r/TributeMe tensor(0.9532, grad_fn=<DivBackward0>) 1126
r/FinancialCareers tensor(0.9526, grad_fn=<DivBackward0>) 1938
r/The100 tensor(0.9522, grad_fn=<DivBackward0>) 1207
r/TrueOffMyChest tensor(0.9507, grad_fn=<DivBackward0>) 12213
r/TheDragonPrince tensor(0.9490, grad_fn=<DivBackward0>) 1219
r/donthelpjustfilm tensor(0.9486, grad_fn=<DivBackward0>) 4047
r/4chan tensor(0.9465, grad_fn=<DivBackward0>)

In [41]:
sorted_sub_scores = {k: v for k, v in sorted(norm_sub_scores.items(), key=lambda item: item[1])}
top_results = dict(itertools.islice(sorted_sub_scores.items(), 100))

print("Top 50 left leaning subreddits")

for sub, score in top_results.items():
    print(sub, score, subreddit_counts[sub])

Top 50 left leaning subreddits
r/sbubby tensor(0.2048, grad_fn=<DivBackward0>) 13760
r/LingeriePlus tensor(0.2107, grad_fn=<DivBackward0>) 1294
r/xboxone tensor(0.2260, grad_fn=<DivBackward0>) 33623
r/thepromisedneverland tensor(0.2273, grad_fn=<DivBackward0>) 1669
r/awakened tensor(0.2427, grad_fn=<DivBackward0>) 1334
r/gamecollecting tensor(0.2570, grad_fn=<DivBackward0>) 4452
r/DuggarsSnark tensor(0.2619, grad_fn=<DivBackward0>) 2168
r/caps tensor(0.2660, grad_fn=<DivBackward0>) 3192
r/mechanical_gifs tensor(0.2688, grad_fn=<DivBackward0>) 5890
r/DrugStashes tensor(0.2820, grad_fn=<DivBackward0>) 2216
r/NoStupidQuestions tensor(0.2857, grad_fn=<DivBackward0>) 58227
r/nintendo tensor(0.2933, grad_fn=<DivBackward0>) 10368
r/identifythisfont tensor(0.2945, grad_fn=<DivBackward0>) 1004
r/whatsthisplant tensor(0.2975, grad_fn=<DivBackward0>) 10235
r/brisbane tensor(0.3036, grad_fn=<DivBackward0>) 3285
r/ufc tensor(0.3076, grad_fn=<DivBackward0>) 6898
r/AdventuresOfSabrina tensor(0.3120, 

### Question 3 - Political Distribution of Major Subreddits

In [None]:
import seaborn as sns

major_subreddits = ['r/politics', 'r/pics', 'r/AskReddit', 'r/Conservative', 'r/Liberal']

def plot_sub_scores(subreddit, subreddit_users):
    sns.set_theme(style="darkgrid")
    scores = []
    
    for user in subreddit_users[subreddit]:
        score = user_predictions[user].detach().numpy()
        scores.append(score)
        
    sns.displot(scores).set(title=subreddit)
#     sns.plt.xlim(0, 1)
#     sns.plt.set_title(subreddit)
#     sns.plt.show()

for sub in major_subreddits:
    plot_sub_scores(sub, subreddit_users)