# Word2Vec

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import math

torch.manual_seed(1)

import json
import glob
import itertools
from os.path import basename
from collections import *
from tqdm.notebook import tqdm
import networkx as nx
import random
import pickle
import tensorflow as tf
from collections import Counter
from tensorboardX import SummaryWriter

# Seed Python's own RNG too: negative sampling draws from the `random` module,
# which torch.manual_seed() does not affect.
random.seed(1)

# Year prefix used to select the bipartite-network input files.
year = '2018'

# Data processing

# One month of data for development testing

In [2]:
# Development subset: load a single file of the user/subreddit bipartite network.
directory = '/shared/0/projects/reddit-political-affiliation/data/bipartite-networks/' + year + '*_filtered.tsv'
# glob returns paths in arbitrary order; sort so the same file is chosen on every run.
files = sorted(glob.glob(directory))[:1]
if not files:
    # Fail loudly instead of silently producing an empty vocabulary downstream.
    raise FileNotFoundError('No bipartite network files match ' + directory)
vocab = set()

# user_context maps each user to the set of subreddits listed for that user.
user_context = defaultdict(set)
all_subreddits = set()

for fname in tqdm(files, desc='Processing all files'):
    with open(fname, 'rt') as f:
        # Stream the file rather than loading every line with readlines();
        # the inner progress bar therefore shows a running count without a total.
        for line in tqdm(f, position=1, desc='Building vocab from file'):
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            # Each row is expected to have three tab-separated fields; the
            # frequency column is parsed but not used here.
            user, subreddit, freq = line.rstrip('\n').split('\t')
            vocab.add(user)
            vocab.add(subreddit)
            user_context[user].add(subreddit)
            all_subreddits.add(subreddit)

# Sorted so that list positions (used for random negative sampling) are the
# same on every run instead of depending on set iteration order.
all_subreddits = sorted(all_subreddits)
print("Length of vocab: " + str(len(vocab)))
print("User count: " + str(len(user_context)))
print("Subreddit count: " + str(len(all_subreddits)))

HBox(children=(FloatProgress(value=0.0, description='Processing all files', max=1.0, style=ProgressStyle(descr…

HBox(children=(FloatProgress(value=0.0, description='Building vocab from file', max=19865760.0, style=Progress…



Length of vocab: 4227361
User count: 4159637
Subreddit count: 67724


# Entire year of data

In [None]:
# Full run: load every bipartite-network file for the selected year.
directory = '/shared/0/projects/reddit-political-affiliation/data/bipartite-networks/' + year + '*_filtered.tsv'
# glob returns paths in arbitrary order; sort so files are processed in a stable order.
files = sorted(glob.glob(directory))
if not files:
    # Fail loudly instead of silently producing an empty vocabulary downstream.
    raise FileNotFoundError('No bipartite network files match ' + directory)
vocab = set()

# user_context maps each user to the set of subreddits listed for that user.
user_context = defaultdict(set)
all_subreddits = set()

for fname in tqdm(files, desc='Processing all files'):
    with open(fname, 'rt') as f:
        # Stream the file rather than loading every line with readlines();
        # the inner progress bar therefore shows a running count without a total.
        for line in tqdm(f, position=1, desc='Building vocab from file'):
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            # Each row is expected to have three tab-separated fields; the
            # frequency column is parsed but not used here.
            user, subreddit, freq = line.rstrip('\n').split('\t')
            vocab.add(user)
            vocab.add(subreddit)
            user_context[user].add(subreddit)
            all_subreddits.add(subreddit)

# Sorted so that list positions (used for random negative sampling) are the
# same on every run instead of depending on set iteration order.
all_subreddits = sorted(all_subreddits)
print("Length of vocab: " + str(len(vocab)))
print("User count: " + str(len(user_context)))
print("Subreddit count: " + str(len(all_subreddits)))

# Read in political affiliations

In [3]:
# Sum the per-file (user, affiliation, frequency) rows into one Counter per user.
files = glob.glob('/shared/0/projects/reddit-political-affiliation/data/flair-affiliations/20*.tsv')

user_to_politic_counts = defaultdict(Counter)

for path in tqdm(files):
    with open(path, 'rt') as handle:
        for row in handle:
            # int() ignores the trailing newline left on the last field.
            user, politics, freq = row.split('\t')
            user_to_politic_counts[user][politics] += int(freq)

print("User to politic counts: " + str(len(user_to_politic_counts)))
print(list(itertools.islice(user_to_politic_counts.items(), 10)))

# Users seen with more than one affiliation label are left out; everyone else
# is mapped to their single label.
user_to_politics = {
    name: next(iter(counts))
    for name, counts in user_to_politic_counts.items()
    if len(counts) <= 1
}
print('Saw political affiliations for %d users' % len(user_to_politics))

HBox(children=(FloatProgress(value=0.0, max=164.0), HTML(value='')))


User to politic counts: 7832
[('unitedstates', Counter({'Republican': 36})), ('ixid', Counter({'Democrat': 77})), ('TheMG', Counter({'Democrat': 12})), ('MoosPalang', Counter({'Democrat': 2})), ('well_here_I_am', Counter({'Republican': 46})), ('madwilliamflint', Counter({'Republican': 3})), ('lannister80', Counter({'Democrat': 7})), ('dcgh96', Counter({'Republican': 12})), ('G-3-R', Counter({'Republican': 9})), ('Eat_The_Muffin', Counter({'Republican': 3}))]
Saw political affiliations for 7775 users


## Convert to binary labels

In [61]:
# Encode affiliations as binary targets: "Democrat" -> 0, any other label -> 1.
# Values that are already 0/1 are skipped so that re-running this cell is a
# no-op; without the guard a second run would turn every label into 1,
# because 0 != "Democrat".
for user, politics in user_to_politics.items():
    if politics in (0, 1):
        continue
    user_to_politics[user] = 0 if politics == "Democrat" else 1

# Train Word2Vec

Subreddits are the context and users are the target (i.e. `users2subreddit`): the model predicts a user from a fixed-size window of that user's subreddits.

In [5]:
# Build bag of words context vectors: each entry is
# (list of CONTEXT_SIZE subreddits, user), using non-overlapping windows over
# the user's subreddits. Leftover subreddits that do not fill a window are dropped.
CONTEXT_SIZE = 10
context_vecs = []
vocab = set()

for user, subs in tqdm(user_context.items()):
    subs = list(subs)
    vocab.add(user)
    vocab.update(subs)
    # The "+ 1" keeps the last full window: without it a user with exactly
    # CONTEXT_SIZE subreddits (or any exact multiple) loses their final window.
    for i in range(0, len(subs) - CONTEXT_SIZE + 1, CONTEXT_SIZE):
        context_vecs.append((subs[i:i + CONTEXT_SIZE], user))

print(context_vecs[:2])

HBox(children=(FloatProgress(value=0.0, max=4159637.0), HTML(value='')))


[(['r/tifu', 'r/forwardsfromgrandma', 'r/dontdeadopeninside', 'r/badlinguistics', 'r/woahdude', 'r/Minecraft', 'r/assholedesign', 'r/meirl', 'r/civ', 'r/tumblr'], 'firedrake242'), (['r/communism', 'r/math_irl', 'r/esist', 'r/DnDGreentext', 'r/hmmm', 'r/history', 'r/translator', 'r/self', 'r/gifs', 'r/mildlyinteresting'], 'firedrake242')]


In [6]:
def generate_negative_samples(user, n):
    """Draw `n` subreddits, with replacement, that are not in the user's context.

    Reads the module-level `all_subreddits` (sequence of subreddit names) and
    `user_context` (mapping user -> collection of that user's subreddits).

    Args:
        user: Key to look up in `user_context`. An unknown user is treated as
            having no subreddits.
        n: Number of negative samples to return.

    Returns:
        A list of `n` entries from `all_subreddits`, none of which is in the
        user's context. The same subreddit may appear more than once.

    Raises:
        ValueError: If `n > 0` but there is no subreddit outside the user's
            context to draw from (previously this looped forever).
    """
    # .get() instead of [] so that an unseen user is not inserted into
    # user_context when it is a defaultdict.
    positives = user_context.get(user, ())

    if n > 0:
        # Cheap length test first; the exact scan only runs in the rare case
        # where the user's context is at least as large as the candidate pool.
        # Assumes all_subreddits has no duplicates (it is built from a set).
        exhausted = not all_subreddits or (
            len(positives) >= len(all_subreddits)
            and all(sub in positives for sub in all_subreddits)
        )
        if exhausted:
            raise ValueError('No negative subreddit candidates available for user %r' % (user,))

    samples = []
    while len(samples) < n:
        # Rejection sampling: keep drawing until the candidate is a true negative.
        sub = random.choice(all_subreddits)
        if sub not in positives:
            samples.append(sub)

    return samples

# CBOW

In [56]:
class CBOW(nn.Module):
    """Continuous bag-of-words scorer.

    Embeds `context_size` token indices, concatenates the embeddings, and maps
    them through a ReLU hidden layer to one unnormalised score per vocabulary
    entry. No softmax is applied here, so callers that need log-probabilities
    (e.g. for nn.NLLLoss) must apply log_softmax themselves.

    Args:
        vocab_size: Number of rows in the embedding table and number of output scores.
        embedding_size: Dimensionality of each embedding.
        context_size: Number of context tokens per example.
        hidden_size: Width of the hidden layer (default 128).
    """

    def __init__(self, vocab_size, embedding_size, context_size, hidden_size=128):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_size)
        self.linear1 = nn.Linear(embedding_size * context_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, vocab_size)

    def forward(self, inputs):
        """Score every vocabulary entry for the given context.

        Args:
            inputs: LongTensor of token indices, either a single context of
                shape (context_size,) or a batch of shape (batch, context_size).

        Returns:
            Tensor of shape (1, vocab_size) for a single context, or
            (batch, vocab_size) for a batch.
        """
        embeds = self.embeddings(inputs)
        # A 0-D/1-D input is one example; otherwise the leading dim is the batch.
        batch_size = inputs.shape[0] if inputs.dim() > 1 else 1
        flat = embeds.reshape(batch_size, -1)
        hidden = F.relu(self.linear1(flat))
        return self.linear2(hidden)
    
    
def make_context_vector(context, word_to_ix):
    """Translate a sequence of tokens into a 1-D LongTensor of vocabulary indices.

    Args:
        context: Iterable of tokens; each must be a key of `word_to_ix`.
        word_to_ix: Mapping from token to integer index.

    Returns:
        torch.long tensor holding one index per token, in input order.

    Raises:
        KeyError: If a token is missing from `word_to_ix`.
    """
    indices = list(map(word_to_ix.__getitem__, context))
    return torch.tensor(indices, dtype=torch.long)

In [57]:
# TODO: Think of a better name for this
class Political(nn.Module):
    """Logistic-regression head: one linear layer followed by a sigmoid.

    Args:
        embedding_size: Number of input features expected by the linear layer.
    """

    def __init__(self, embedding_size):
        super(Political, self).__init__()
        self.linear = nn.Linear(embedding_size, 1)

    def forward(self, inputs):
        """Return a sigmoid-squashed score of shape (1, 1).

        Args:
            inputs: Float tensor with `embedding_size` elements in total; it is
                flattened into a single row before the linear layer.
        """
        inputs = inputs.view((1, -1))
        out = self.linear(inputs)
        # torch.sigmoid applies the function; nn.Sigmoid is a Module class, so
        # calling nn.Sigmoid(out) tried to construct a layer and raised TypeError.
        return torch.sigmoid(out)

In [59]:
# Model, index mapping, loss and optimizers used by the training cell.
EMBEDDING_DIM = 10
model = CBOW(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
# model = nn.DataParallel(model, device_ids=GPU_IDS)
optimizer = optim.SGD(model.parameters(), lr=0.001)

# Enumerate the vocabulary in sorted order: iterating the raw set would give a
# different token -> index mapping on every kernel restart, so embedding rows
# could not be matched across runs.
word_to_ix = {word: i for i, word in enumerate(sorted(vocab))}
# Note: nn.NLLLoss expects log-probabilities as its input.
loss_function = nn.NLLLoss()
# Separate head and optimizer for the political-affiliation prediction.
pol_model = Political(EMBEDDING_DIM)
pol_optimizer = optim.SGD(pol_model.parameters(), lr=0.001)

## Run tensorboard 

In [61]:
# Load the TensorBoard notebook extension and open the dashboard on
# scalar/word2vec, the log directory passed to SummaryWriter in the training cell.
%load_ext tensorboard
%tensorboard --logdir scalar/word2vec # --port 6546

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6546 (pid 85105), started 0:04:36 ago. (Use '!kill 85105' to kill it.)

In [60]:
EPOCHS = 5
# Number of negative subreddits drawn for every positive (context) subreddit.
NEGATIVES_PER_POSITIVE = 2
losses = []
writer = SummaryWriter(logdir='scalar/word2vec')

try:
    for epoch in tqdm(range(EPOCHS), desc='Epoch'):
        total_loss = 0.0

        for subreddits, user in tqdm(context_vecs, desc='Processing subreddits for user'):
            context_ids = make_context_vector(subreddits, word_to_ix)
            user_id = torch.tensor([word_to_ix[user]], dtype=torch.long)

            # Gradients accumulate across backward() calls, so clear them
            # before every example.
            optimizer.zero_grad()
            pol_optimizer.zero_grad()

            out_act = model(context_ids)

            # The model returns raw scores; convert them to log-probabilities
            # before the loss. log_softmax is idempotent, so this is also valid
            # if loss_function already normalises its input.
            loss = loss_function(F.log_softmax(out_act, dim=1), user_id)

            negative_samples = generate_negative_samples(user, len(subreddits) * NEGATIVES_PER_POSITIVE)
            negative_ids = make_context_vector(negative_samples, word_to_ix)

            # Push sigmoid(score) towards 1 for the context subreddits ...
            loss = loss + (1 - torch.sigmoid(out_act[0, context_ids])).sum()
            # ... and towards 0 for the negatives. The previous "0 - sigmoid"
            # term rewarded high scores for negatives when minimised.
            loss = loss + torch.sigmoid(out_act[0, negative_ids]).sum()

            # If we know their political affiliation, add a classification loss.
            if user in user_to_politics:
                # The political head takes an EMBEDDING_DIM-sized vector, so feed
                # it the user's embedding rather than the raw vocabulary index.
                user_embedding = model.embeddings(user_id).view(1, -1)
                # BCE-with-logits takes the pre-sigmoid score, so the head's
                # linear layer is called directly.
                pol_logit = pol_model.linear(user_embedding)
                # Assumes labels were already converted to 0/1 by the
                # "Convert to binary labels" cell.
                pol_target = torch.tensor([[float(user_to_politics[user])]])
                pol_loss = F.binary_cross_entropy_with_logits(pol_logit, pol_target)
                loss = loss + pol_loss

            # Single backward pass over the combined loss; calling backward()
            # separately on pol_loss and then on loss would traverse the same
            # graph twice.
            loss.backward()
            optimizer.step()
            pol_optimizer.step()

            total_loss += loss.item()

        writer.add_scalar('word2vec loss', total_loss, epoch)
        print(total_loss)
        losses.append(total_loss)
finally:
    # Close the writer even if training is interrupted (e.g. KeyboardInterrupt).
    writer.close()

losses

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Processing subreddits for user', max=719347.0, style=Prog…





KeyboardInterrupt: 

# Embeddings Results

In [None]:
def top_n_similar(subreddit, n, candidates=None):
    """Return the `n` candidates whose embeddings are most cosine-similar to `subreddit`.

    Reads the module-level `model` (for `model.embeddings`), `word_to_ix`, and,
    when `candidates` is not given, `all_subreddits`.

    Args:
        subreddit: Query token; must be a key of `word_to_ix`.
        n: Maximum number of results to return.
        candidates: Optional iterable of tokens to rank (iterating a dict uses
            its keys). Defaults to `all_subreddits`; the `top_subs` name used
            previously is not defined anywhere in the visible notebook.

    Returns:
        dict mapping candidate -> similarity (1-element tensor), ordered from
        most to least similar. The query itself is included if it is a candidate.

    Raises:
        KeyError: If `subreddit` or a candidate is missing from `word_to_ix`.
        ValueError: If `n` is negative.
    """
    if n < 0:
        raise ValueError('n must be non-negative')
    if candidates is None:
        candidates = all_subreddits
    candidates = list(candidates)
    if n == 0 or not candidates:
        return {}

    # Inference only: no autograd graph is needed for a similarity lookup.
    with torch.no_grad():
        query = model.embeddings(torch.tensor([word_to_ix[subreddit]], dtype=torch.long))
        candidate_ids = torch.tensor([word_to_ix[c] for c in candidates], dtype=torch.long)
        # One vectorised call; the (1, dim) query broadcasts against (N, dim).
        sims = F.cosine_similarity(query, model.embeddings(candidate_ids), dim=1, eps=1e-6)

    scores = sims.tolist()
    # sorted() is stable, so ties keep their candidate order.
    ranked = sorted(range(len(candidates)), key=scores.__getitem__, reverse=True)[:n]
    return {candidates[i]: sims[i:i + 1] for i in ranked}
    
# Example query: the 10 entries ranked most similar to r/CryptoCurrency.
top_n_similar('r/CryptoCurrency', n=10)

# Save embeddings to TSV

# Predict Political Affiliation 