# Single Month Analysis

## Load in the data and model

In [6]:
import os 
import sys

sys.path.append('/home/kalkiek/projects/reddit-political-affiliation/')

import itertools
import random
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from tqdm.notebook import tqdm
from collections import defaultdict
from sklearn.metrics import accuracy_score

from src.data.make_dataset import build_dataset
from src.models.word2vec.User2Subreddit import User2Subreddit

from sklearn.metrics import auc, roc_curve

In [2]:
year_month = '2019-04'

network_path = '/shared/0/projects/reddit-political-affiliation/data/bipartite-networks/' + year_month + '_filtered.tsv'
flair_directory = '/shared/0/projects/reddit-political-affiliation/data/flair-affiliations/' + year_month + '.tsv'

dataset, training, validation, pol_validation, vocab = build_dataset(network_path, flair_directory)


Building vocab from file:   0%|          | 0/28132858 [00:00<?, ?it/s][A
Building vocab from file:   0%|          | 49191/28132858 [00:00<00:57, 491905.78it/s][A
Building vocab from file:   0%|          | 63949/28132858 [00:00<07:57, 58758.48it/s] [A
Building vocab from file:   0%|          | 126495/28132858 [00:00<05:47, 80691.87it/s][A
Building vocab from file:   1%|          | 183458/28132858 [00:01<04:17, 108676.27it/s][A
Building vocab from file:   1%|          | 232079/28132858 [00:01<03:16, 141679.76it/s][A
Building vocab from file:   1%|          | 282784/28132858 [00:01<02:34, 180638.57it/s][A
Building vocab from file:   1%|          | 330260/28132858 [00:01<02:05, 221875.09it/s][A
Building vocab from file:   1%|▏         | 382402/28132858 [00:01<01:43, 268076.32it/s][A
Building vocab from file:   2%|▏         | 434637/28132858 [00:01<01:28, 313919.45it/s][A
Building vocab from file:   2%|▏         | 482903/28132858 [00:02<03:48, 121051.23it/s][A
Building vocab fro

Building vocab from file:  18%|█▊        | 5136977/28132858 [00:14<00:38, 596582.29it/s][A
Building vocab from file:  18%|█▊        | 5198721/28132858 [00:14<00:41, 550466.37it/s][A
Building vocab from file:  19%|█▊        | 5260172/28132858 [00:14<00:40, 568232.34it/s][A
Building vocab from file:  19%|█▉        | 5318655/28132858 [00:14<00:41, 550959.53it/s][A
Building vocab from file:  19%|█▉        | 5375028/28132858 [00:14<00:41, 551272.14it/s][A
Building vocab from file:  19%|█▉        | 5431049/28132858 [00:14<00:43, 524103.84it/s][A
Building vocab from file:  19%|█▉        | 5484365/28132858 [00:14<00:43, 523118.80it/s][A
Building vocab from file:  20%|█▉        | 5537312/28132858 [00:16<03:37, 104074.65it/s][A
Building vocab from file:  20%|█▉        | 5594141/28132858 [00:16<02:43, 137857.94it/s][A
Building vocab from file:  20%|██        | 5659250/28132858 [00:16<02:04, 180555.66it/s][A
Building vocab from file:  20%|██        | 5723134/28132858 [00:16<01:37, 229992

Building vocab from file:  38%|███▊      | 10584938/28132858 [00:28<00:26, 661318.86it/s][A
Building vocab from file:  38%|███▊      | 10651364/28132858 [00:28<00:26, 654996.95it/s][A
Building vocab from file:  38%|███▊      | 10717255/28132858 [00:28<00:26, 656162.89it/s][A
Building vocab from file:  38%|███▊      | 10783026/28132858 [00:30<02:47, 103440.39it/s][A
Building vocab from file:  39%|███▊      | 10845000/28132858 [00:30<02:05, 137907.10it/s][A
Building vocab from file:  39%|███▊      | 10896553/28132858 [00:30<01:38, 175395.83it/s][A
Building vocab from file:  39%|███▉      | 10948007/28132858 [00:30<01:18, 218626.06it/s][A
Building vocab from file:  39%|███▉      | 10998776/28132858 [00:30<01:05, 262255.26it/s][A
Building vocab from file:  39%|███▉      | 11061475/28132858 [00:30<00:53, 317698.99it/s][A
Building vocab from file:  40%|███▉      | 11123420/28132858 [00:30<00:45, 372072.02it/s][A
Building vocab from file:  40%|███▉      | 11179799/28132858 [00:30<00

Building vocab from file:  55%|█████▍    | 15352370/28132858 [00:43<00:52, 241652.59it/s][A
Building vocab from file:  55%|█████▍    | 15415914/28132858 [00:43<00:42, 296770.70it/s][A
Building vocab from file:  55%|█████▌    | 15480755/28132858 [00:43<00:35, 354433.99it/s][A
Building vocab from file:  55%|█████▌    | 15542358/28132858 [00:43<00:30, 406177.86it/s][A
Building vocab from file:  55%|█████▌    | 15601816/28132858 [00:43<00:29, 418609.26it/s][A
Building vocab from file:  56%|█████▌    | 15656961/28132858 [00:43<00:28, 435617.89it/s][A
Building vocab from file:  56%|█████▌    | 15710000/28132858 [00:43<00:28, 442082.60it/s][A
Building vocab from file:  56%|█████▌    | 15760865/28132858 [00:43<00:27, 451323.26it/s][A
Building vocab from file:  56%|█████▌    | 15810705/28132858 [00:44<00:26, 460681.53it/s][A
Building vocab from file:  56%|█████▋    | 15860116/28132858 [00:44<00:26, 465987.78it/s][A
Building vocab from file:  57%|█████▋    | 15909258/28132858 [00:44<00

Building vocab from file:  71%|███████   | 19877678/28132858 [00:57<01:52, 73598.30it/s][A
Building vocab from file:  71%|███████   | 19931240/28132858 [00:57<01:22, 99293.09it/s][A
Building vocab from file:  71%|███████   | 19974622/28132858 [00:58<01:03, 129175.89it/s][A
Building vocab from file:  71%|███████   | 20015188/28132858 [00:58<00:50, 162307.10it/s][A
Building vocab from file:  71%|███████▏  | 20055939/28132858 [00:58<00:40, 198059.30it/s][A
Building vocab from file:  71%|███████▏  | 20096530/28132858 [00:58<00:34, 233942.19it/s][A
Building vocab from file:  72%|███████▏  | 20137102/28132858 [00:58<00:30, 264121.13it/s][A
Building vocab from file:  72%|███████▏  | 20176838/28132858 [00:58<00:27, 292124.75it/s][A
Building vocab from file:  72%|███████▏  | 20218443/28132858 [00:58<00:24, 320789.05it/s][A
Building vocab from file:  72%|███████▏  | 20258546/28132858 [00:58<00:23, 337804.16it/s][A
Building vocab from file:  72%|███████▏  | 20299775/28132858 [00:58<00:2

Building vocab from file:  85%|████████▌ | 23936359/28132858 [01:10<00:10, 399894.74it/s][A
Building vocab from file:  85%|████████▌ | 23976720/28132858 [01:10<00:10, 396798.77it/s][A
Building vocab from file:  85%|████████▌ | 24016666/28132858 [01:10<00:10, 388496.93it/s][A
Building vocab from file:  86%|████████▌ | 24055740/28132858 [01:10<00:10, 384269.14it/s][A
Building vocab from file:  86%|████████▌ | 24094335/28132858 [01:10<00:10, 383337.72it/s][A
Building vocab from file:  86%|████████▌ | 24132787/28132858 [01:11<00:10, 376305.93it/s][A
Building vocab from file:  86%|████████▌ | 24181167/28132858 [01:11<00:09, 403178.77it/s][A
Building vocab from file:  86%|████████▌ | 24222101/28132858 [01:11<00:09, 396885.21it/s][A
Building vocab from file:  86%|████████▌ | 24262243/28132858 [01:11<00:09, 388855.74it/s][A
Building vocab from file:  86%|████████▋ | 24302228/28132858 [01:11<00:09, 392087.92it/s][A
Building vocab from file:  87%|████████▋ | 24341691/28132858 [01:11<00

Building vocab from file:  98%|█████████▊| 27449204/28132858 [01:28<00:02, 336723.70it/s][A
Building vocab from file:  98%|█████████▊| 27485592/28132858 [01:28<00:01, 339769.06it/s][A
Building vocab from file:  98%|█████████▊| 27521477/28132858 [01:28<00:01, 343020.59it/s][A
Building vocab from file:  98%|█████████▊| 27557122/28132858 [01:28<00:01, 339085.38it/s][A
Building vocab from file:  98%|█████████▊| 27591981/28132858 [01:28<00:01, 340255.82it/s][A
Building vocab from file:  98%|█████████▊| 27631590/28132858 [01:28<00:01, 355278.54it/s][A
Building vocab from file:  98%|█████████▊| 27669369/28132858 [01:28<00:01, 361743.03it/s][A
Building vocab from file:  98%|█████████▊| 27707607/28132858 [01:29<00:01, 367693.95it/s][A
Building vocab from file:  99%|█████████▊| 27745049/28132858 [01:29<00:01, 369685.05it/s][A
Building vocab from file:  99%|█████████▉| 27782270/28132858 [01:29<00:00, 366518.39it/s][A
Building vocab from file:  99%|█████████▉| 27819797/28132858 [01:29<00

Length of vocab: 5762539
User count: 5697629
Subreddit count: 64910
User to politic counts: 2589
[('WatchMaga', Counter({'Republican': 2})), ('BasedMedicalDoctor', Counter({'Republican': 2})), ('Damemezaredadremez', Counter({'Republican': 2})), ('joeohegna', Counter({'Republican': 1})), ('Deplorable_scum', Counter({'Republican': 2})), ('TheRabbidHD', Counter({'Republican': 2})), ('Paladin-Arda', Counter({'Democrat': 2})), ('Trumpwillalwayswin', Counter({'Republican': 2})), ('daw-nee-yale', Counter({'Republican': 1})), ('BigcatTV', Counter({'Republican': 2}))]
Saw political affiliations for 2588 users
User to politics training size: {}: 2330
User to politics validation size: {}: 258


Converting data to PyTorch: 100%|██████████| 5697629/5697629 [11:05<00:00, 8555.48it/s] 


Train size: 151917434 Validation size: 16879714


In [25]:
path = '/shared/0/projects/reddit-political-affiliation/data/word2vec/dataset/' + year_month
dataset.load_id_mappings(id_mappings_path)

# We'll also need these
word_to_ix = {word: i for i, word in enumerate(vocab)}
all_subreddits = {v for v in vocab if v[:2] == 'r/' and v[2:4] != 'u_'}
print("# of subreddits: " + str(len(all_subreddits)))

Loading user id mappings
Total users: 5697629
Loading subreddit id mappings
Total subreddits: 64910
Loading user subreddits
# of subreddits: 60708


In [13]:
# Load in the actual model
PATH = '/shared/0/projects/reddit-political-affiliation/working-dir/word2vec-outputs/' + year_month + '/9.pt'
embedding_dim = 50

# Sorry for the hardcoding ... will update later
model = User2Subreddit(dataset.num_users(), embedding_dim, len(subreddit_to_idx))
model.load_state_dict(torch.load(PATH, map_location=torch.device('cpu')))
model.eval()

User2Subreddit(
  (u_embeddings): Embedding(5697629, 50)
  (v_embeddings): Embedding(64910, 50)
  (political_layer): Linear(in_features=50, out_features=1, bias=True)
  (before_pol_dropout): Dropout(p=0.5, inplace=False)
)

## Model Accuracy on Political Validation Set

In [26]:
user_ids, pol_labels = [], []

for user, pol_label in pol_validation.items():
    try:
        # User subreddit dataset spans 1 month. Political data spans the year. Some users might not be present
        user_ids.append(dataset.user_to_idx[user])
        pol_labels.append(pol_label)
    except KeyError:
        pass

user_ids = torch.LongTensor(user_ids)
pol_labels = torch.FloatTensor(pol_labels)

emb_p = model.u_embeddings(user_ids)
political_predictions = model.political_layer(emb_p)
political_predictions = torch.sigmoid(political_predictions)

preds = []
for val in political_predictions.detach().numpy():
    if val[0] >= 0.5:
        preds.append(1)
    else:
        preds.append(0)

labels = pol_labels.detach().numpy().astype(int)
accuracy_score(labels, preds)

0.8326180257510729

### Question 1 - Are most users left leaning or right leaning

In [37]:
def predict_user_affiliations(model, dataset):
    user_predictions = {}
    loader = DataLoader(dataset, batch_size=512)

    idx_to_user = {v: k for k, v in dataset.user_to_idx.items()}

    for i, data in enumerate(tqdm(loader, desc="Predicting user politics", total=len(dataset) / 512)):
        user_sub, politics_labels, subreddit_labels = data
        user_ids = user_sub[:, 0]
        subreddit_ids = user_sub[:, 1]
        
        _, pol_preds = model(user_ids, subreddit_ids, political_user_ids=user_ids)
        
        user_ids = user_ids.detach().numpy()
        
        for j in range(len(user_ids)):
            user = idx_to_user[user_ids[j]]
            user_predictions[user] = pol_preds[j][0]

    return user_predictions

user_predictions = predict_user_affiliations(model, dataset)

HBox(children=(HTML(value='Predicting user politics'), FloatProgress(value=0.0, max=329681.9296875), HTML(valu…




### Save the predictions for later

In [38]:
out_dir = '/shared/0/projects/reddit-political-affiliation/data/word2vec/predictions/users_' + year_month + '.tsv'

with open(out_dir, 'w') as f:
    for user, score in user_predictions.items():
        f.write("{}\t{}\n".format(user, score))

### Load in previous predictions

In [25]:
in_dir = '/shared/0/projects/reddit-political-affiliation/data/word2vec/predictions/users_' + year_month + '.tsv'
user_predictions = {}

with open(in_dir, 'r') as f:
    for line in f:
        user, score = line.split('\t')
        user_predictions[user] = float(score.strip())

In [39]:
threshold = 0.5
left_count, right_count = 0., 0.

for user, prediction in user_predictions.items():    
    if prediction >= threshold:
        right_count += 1
    else:
        left_count += 1
        
total = len(user_predictions)
percent_right = right_count / total
percent_left = left_count / total

print("Percent of users left leaning: {}".format(percent_left))
print("Percent of users right leaning: {}".format(percent_right))

Percent of users left leaning: 0.16876335752994798
Percent of users right leaning: 0.831236642470052


### Question 2 - Political Leanings of Major Subreddits

Find the top left and right subreddits

In [40]:
from collections import defaultdict, Counter

# Number of scores a subreddit needs before we include it in this analysis
MIN_POST_THRESHOLD = 1000

subreddit_scores = defaultdict(lambda:0)
subreddit_counts = Counter()
subreddit_users = defaultdict(set)


for user, prediction in user_predictions.items():
    subreddits = user_subreddits[user]
    for sub in subreddits:
        if sub[2:4] != 'u_':
            subreddit_scores[sub] += prediction
            subreddit_counts[sub] += 1
            subreddit_users[sub].add(user)
            
        
norm_sub_scores = {}

for sub, score in subreddit_scores.items():
    count = subreddit_counts[sub]
    if count >= MIN_POST_THRESHOLD:
        norm_sub_scores[sub] = score / count
    

sorted_sub_scores = {k: v for k, v in sorted(norm_sub_scores.items(), key=lambda item: item[1], reverse=True)}
top_results = dict(itertools.islice(sorted_sub_scores.items(), 50))

print("Top 50 right leaning subreddits")

for sub, score in top_results.items():
    print(sub, score, subreddit_counts[sub])

Top 50 right leaning subreddits
r/twinks tensor(0.9797, grad_fn=<DivBackward0>) 3247
r/ETHplode tensor(0.9766, grad_fn=<DivBackward0>) 1142
r/dubai tensor(0.9707, grad_fn=<DivBackward0>) 1928
r/Shadowverse tensor(0.9677, grad_fn=<DivBackward0>) 1480
r/spiderbro tensor(0.9662, grad_fn=<DivBackward0>) 1496
r/The_Donald tensor(0.9641, grad_fn=<DivBackward0>) 44652
r/INDYCAR tensor(0.9636, grad_fn=<DivBackward0>) 1611
r/wrestling tensor(0.9563, grad_fn=<DivBackward0>) 1113
r/subredditcancer tensor(0.9548, grad_fn=<DivBackward0>) 2057
r/MilitaryGfys tensor(0.9537, grad_fn=<DivBackward0>) 1855
r/TributeMe tensor(0.9532, grad_fn=<DivBackward0>) 1126
r/FinancialCareers tensor(0.9526, grad_fn=<DivBackward0>) 1938
r/The100 tensor(0.9522, grad_fn=<DivBackward0>) 1207
r/TrueOffMyChest tensor(0.9507, grad_fn=<DivBackward0>) 12213
r/TheDragonPrince tensor(0.9490, grad_fn=<DivBackward0>) 1219
r/donthelpjustfilm tensor(0.9486, grad_fn=<DivBackward0>) 4047
r/4chan tensor(0.9465, grad_fn=<DivBackward0>)

In [41]:
sorted_sub_scores = {k: v for k, v in sorted(norm_sub_scores.items(), key=lambda item: item[1])}
top_results = dict(itertools.islice(sorted_sub_scores.items(), 100))

print("Top 50 left leaning subreddits")

for sub, score in top_results.items():
    print(sub, score, subreddit_counts[sub])

Top 50 left leaning subreddits
r/sbubby tensor(0.2048, grad_fn=<DivBackward0>) 13760
r/LingeriePlus tensor(0.2107, grad_fn=<DivBackward0>) 1294
r/xboxone tensor(0.2260, grad_fn=<DivBackward0>) 33623
r/thepromisedneverland tensor(0.2273, grad_fn=<DivBackward0>) 1669
r/awakened tensor(0.2427, grad_fn=<DivBackward0>) 1334
r/gamecollecting tensor(0.2570, grad_fn=<DivBackward0>) 4452
r/DuggarsSnark tensor(0.2619, grad_fn=<DivBackward0>) 2168
r/caps tensor(0.2660, grad_fn=<DivBackward0>) 3192
r/mechanical_gifs tensor(0.2688, grad_fn=<DivBackward0>) 5890
r/DrugStashes tensor(0.2820, grad_fn=<DivBackward0>) 2216
r/NoStupidQuestions tensor(0.2857, grad_fn=<DivBackward0>) 58227
r/nintendo tensor(0.2933, grad_fn=<DivBackward0>) 10368
r/identifythisfont tensor(0.2945, grad_fn=<DivBackward0>) 1004
r/whatsthisplant tensor(0.2975, grad_fn=<DivBackward0>) 10235
r/brisbane tensor(0.3036, grad_fn=<DivBackward0>) 3285
r/ufc tensor(0.3076, grad_fn=<DivBackward0>) 6898
r/AdventuresOfSabrina tensor(0.3120, 

### Question 3 - Political Distribution of Major Subreddits

In [None]:
import seaborn as sns

major_subreddits = ['r/politics', 'r/pics', 'r/AskReddit', 'r/Conservative', 'r/Liberal']

def plot_sub_scores(subreddit, subreddit_users):
    sns.set_theme(style="darkgrid")
    scores = []
    
    for user in subreddit_users[subreddit]:
        score = user_predictions[user].detach().numpy()
        scores.append(score)
        
    sns.displot(scores).set(title=subreddit)
#     sns.plt.xlim(0, 1)
#     sns.plt.set_title(subreddit)
#     sns.plt.show()

for sub in major_subreddits:
    plot_sub_scores(sub, subreddit_users)