# Word2Vec

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import math

import numpy as np

torch.manual_seed(1)

from torch.utils.data import Dataset, DataLoader

import json
import glob
import itertools
from os.path import basename
from collections import *
from tqdm.notebook import tqdm
import networkx as nx
import random
import pickle
import tensorflow as tf
from collections import Counter
from tensorboardX import SummaryWriter

year = '2018'

# Data processing

# One month of data for development testing

In [4]:
directory = '/shared/0/projects/reddit-political-affiliation/data/bipartite-networks/' + year + '*_filtered.tsv'
files = glob.glob(directory)[:1]
vocab = set()

# Target is the subreddit and context is the users
user_context = defaultdict(set)
all_subreddits = set()

for fname in tqdm(files, desc='Processing all files'):
        with open(fname, 'rt') as f:
            lines = f.readlines()

        for line in tqdm(lines, position=1, desc='Building vocab from file'):
            user, subreddit, freq = line[:-1].split('\t')
            vocab.add(user)
            vocab.add(subreddit)
            user_context[user].add(subreddit)
            all_subreddits.add(subreddit)
            
            
all_subreddits = list(all_subreddits)
print("Length of vocab: " + str(len(vocab)))
print("User count: " + str(len(user_context)))
print("Subreddit count: " + str(len(all_subreddits)))

HBox(children=(FloatProgress(value=0.0, description='Processing all files', max=1.0, style=ProgressStyle(descr…

HBox(children=(FloatProgress(value=0.0, description='Building vocab from file', max=19865760.0, style=Progress…



Length of vocab: 4227361
User count: 4159637
Subreddit count: 67724


# Entire year of data

In [None]:
directory = '/shared/0/projects/reddit-political-affiliation/data/bipartite-networks/' + year + '*_filtered.tsv'
files = glob.glob(directory)
vocab = set()

# Target is the subreddit and context is the users
user_context = defaultdict(set)
all_subreddits = set()

for fname in tqdm(files, desc='Processing all files'):
        with open(fname, 'rt') as f:
            lines = f.readlines()

        for line in tqdm(lines, position=1, desc='Building vocab from file'):
            user, subreddit, freq = line[:-1].split('\t')
            vocab.add(user)
            vocab.add(subreddit)
            user_context[user].add(subreddit)
            all_subreddits.add(subreddit)
            
all_subreddits = list(all_subreddits)
print("Length of vocab: " + str(len(vocab)))
print("User count: " + str(len(user_context)))
print("Subreddit count: " + str(len(all_subreddits)))

# Read in political affliations

In [5]:
files = glob.glob('/shared/0/projects/reddit-political-affiliation/data/flair-affiliations/20*.tsv')

user_to_politic_counts = defaultdict(Counter)

for fname in tqdm(files):
    with open(fname, 'rt') as f:
        for line in f:
            user, politics, freq = line.split('\t')
            user_to_politic_counts[user][politics] += int(freq)
            
print("User to politic counts: " + str(len(user_to_politic_counts)))
print(list(user_to_politic_counts.items())[:10])


user_to_politics = {}
for u, pc in user_to_politic_counts.items():
    if len(pc) > 1:
        continue
    user_to_politics[u] = list(pc.keys())[0]
print('Saw political affiliations for %d users' % len(user_to_politics))

HBox(children=(FloatProgress(value=0.0, max=164.0), HTML(value='')))


User to politic counts: 7832
[('unitedstates', Counter({'Republican': 36})), ('ixid', Counter({'Democrat': 77})), ('TheMG', Counter({'Democrat': 12})), ('MoosPalang', Counter({'Democrat': 2})), ('well_here_I_am', Counter({'Republican': 46})), ('madwilliamflint', Counter({'Republican': 3})), ('lannister80', Counter({'Democrat': 7})), ('dcgh96', Counter({'Republican': 12})), ('G-3-R', Counter({'Republican': 9})), ('Eat_The_Muffin', Counter({'Republican': 3}))]
Saw political affiliations for 7775 users


## Convert to binary labels

In [6]:
for user, politics in user_to_politics.items():
    if politics == "Democrat":
        user_to_politics[user] = 0
    else:
        user_to_politics[user] = 1

## Create a custom dataset class for easier batching

In [7]:
class SubredditUserDataset(Dataset):
    """Face Landmarks dataset."""

    def __init__(self, user_to_subreddits, all_subreddits, user_to_politics, \
                 num_negative_samples=5, max_users=-1):

        self.pos_and_neg_samples = []
        # Mappings to the embedding dimensions
        self.user_to_idx = {}
        self.subreddit_to_idx = {}        
        
        def get_sub_idx(subreddit):
            if subreddit in self.subreddit_to_idx:
                sub_idx = self.subreddit_to_idx[subreddit]
            else:
                sub_idx = len(self.subreddit_to_idx)
                self.subreddit_to_idx[subreddit] = len(self.subreddit_to_idx)            
            return sub_idx
        
        num_users = len(user_context) if max_users < 0 else max_users
        #c = Counter()
        for i, (user, subreddits) in enumerate(tqdm(user_context.items(), total=num_users)):
            if i >= num_users:
                break
            
            if user in user_to_politics:
                politics = user_to_politics[user]
            else:
                politics = -1
            #c[politics]+=1
            
            self.user_to_idx[user] = len(self.user_to_idx)
            user_idx = self.user_to_idx[user]
            
            # Add all the positive samples
            for subreddit in subreddits:
                sub_idx = get_sub_idx(subreddit)
                self.pos_and_neg_samples.append((np.array([user_idx, sub_idx]), politics, 1))
                
            # Choose fixed negative samples 
            neg = []
            num_neg = len(subreddits)*num_negative_samples
            # guard against super active users?
            num_neg = min(num_neg, len(all_subreddits) - num_neg)
            while len(neg) < num_neg:
                sub = all_subreddits[random.randint(0, len(all_subreddits) - 1)]
                if sub not in subreddits: # Check if also in neg?
                    neg.append(sub)
            for n in neg:
                sub_idx = get_sub_idx(subreddit)
                self.pos_and_neg_samples.append((np.array([user_idx, sub_idx]), politics, 0))
        #print(c)
    def num_users(self):
        return len(self.user_to_idx)

    def num_subreddits(self):
        return len(self.subreddit_to_idx)    
    
    def __len__(self):
        return len(self.pos_and_neg_samples)

    def __getitem__(self, idx):
        return self.pos_and_neg_samples[idx]

In [8]:
# Create the training data
training_data = SubredditUserDataset(user_context, all_subreddits, user_to_politics, 
                                     max_users=-1)

HBox(children=(FloatProgress(value=0.0, max=4159637.0), HTML(value='')))




# Multi-task User2Subreddit model that also predicts political affiliation

In [10]:
class User2Subreddit(nn.Module):
    
    def __init__(self, num_users, emb_dimension, num_subreddits):
        super(User2Subreddit, self).__init__()
        self.num_users = num_users
        self.emb_dimension = emb_dimension
        self.u_embeddings = nn.Embedding(num_users, emb_dimension)
        self.v_embeddings = nn.Embedding(num_subreddits, emb_dimension)
        self.political_layer = nn.Linear(emb_dimension, 1)
        self.init_emb()
        
    def init_emb(self):
        initrange = 0.5 / self.emb_dimension
        self.u_embeddings.weight.data.uniform_(-initrange, initrange)
        self.v_embeddings.weight.data.uniform_(-initrange, initrange)
        self.political_layer.weight.data.uniform_(-initrange, initrange)

    def forward(self, user_id, subreddit_id, political_user_ids):
        emb_u = self.u_embeddings(user_id)
        emb_v = self.v_embeddings(subreddit_id)
        #print(emb_u.shape)
        #print(emb_v.shape)
        
        # reshape to support batch dot-product
        #emb_u = emb_u.view(emb_u.shape[0], 1, emb_u.shape[1])
        #emb_v = emb_v.view(emb_v.shape[0], emb_v.shape[1], 1)       
        #score = torch.bmm(emb_u, emb_v)
        
        # This this seems like the fastest way to do batch dot product:
        # https://github.com/pytorch/pytorch/issues/18027
        score = (emb_u*emb_v).sum(-1)
        
        #print(score.shape)
        score = torch.sigmoid(score)
        
        # If we have political users to predict for
        if political_user_ids.sum() > 0:
            emb_p = self.u_embeddings(political_user_ids)
            political_predictions = self.political_layer(emb_p)
            political_predictions = torch.sigmoid(political_predictions)
        else:
            political_predictions = None
            
        return score, political_predictions

In [12]:
EMBEDDING_DIM = 50

device = torch.device("cuda:6") # Check with GPU is free with nvidia-smi

#model = CBOW(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE).to(device)
model = User2Subreddit(training_data.num_users(), EMBEDDING_DIM, training_data.num_subreddits()).to(device)

# model = nn.DataParallel(model, device_ids=GPU_IDS)
optimizer = optim.AdamW(model.parameters(), lr=0.0001)

word_to_ix = {word: i for i, word in enumerate(vocab)}
loss_function = nn.BCELoss()
#pol_model = Political(EMBEDDING_DIM).to(device)

#pol_optimizer = optim.AdamW(pol_model.parameters(), lr=0.001)

## Run tensorboard 

In [None]:
#%load_ext tensorboard
#%tensorboard --logdir scalar/word2vec  --port 8010

# Train the model

In [None]:
batch_size=2000
EPOCHS = 50

dataloader = DataLoader(training_data, batch_size=batch_size, shuffle=True)

writer = SummaryWriter(logdir='/shared/0/projects/reddit-political-affiliation/tensorboard-logs/')



for epoch in tqdm(range(EPOCHS), desc='Epoch'):
    total_loss, pol_loss = 0, 0

    for i, data in enumerate(tqdm(dataloader, total=len(training_data)/batch_size), 1):
        optimizer.zero_grad()

        user_sub, politics_labels, subreddit_labels = data    
        
        # print(politics_labels)

        user_ids = user_sub[:,0]
        subreddit_ids = user_sub[:,1].to(device)
        
        # Grab the user IDs for those that had political labels
        p_indices = [ i for i, v in enumerate(politics_labels) if v >= 0 ]
        political_ids = user_ids.index_select(0, torch.LongTensor(p_indices))
        
        user_ids = user_ids.to(device)
        political_ids = political_ids.to(device)
        
        subreddit_preds, pol_preds = model(user_ids, subreddit_ids, political_ids)
        
        #print(preds.shape)
        #print(labels.shape)
        #print('preds: ', preds)
        #print('labes: ', labels)
        
        subreddit_labels = subreddit_labels.float().to(device)
        
        loss = loss_function(subreddit_preds, subreddit_labels)
        
        # If we had some political users in this batch...
        if len(p_indices) > 0:
            pol_labels = torch.LongTensor([ v for v in politics_labels if v >= 0 ]).float().to(device)        
            
            # Squeeze call necessary to go from (k, 1) to (k) dimensions due to batching
            pol_loss = loss_function(pol_preds.squeeze(), pol_labels)
            
            #print(pol_labels.shape, pol_preds.shape)
            
            writer.add_scalar('political loss', pol_loss.cpu().detach().numpy(),
                              i*batch_size + epoch*len(training_data))
            loss += pol_loss
        
        loss.backward()
        optimizer.step()
        
            
        writer.add_scalar('word2vec loss', loss.cpu().detach().numpy(),
                          i*batch_size + epoch*len(training_data))

        if i % 1000 == 0:
            print('Loss at step %d: %f' % (i, loss.cpu().detach().numpy()))
            
        #break

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=50.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, max=59597.28), HTML(value='')))

Loss at step 1000: 1.303576
Loss at step 2000: 1.190714
Loss at step 3000: 1.141858
Loss at step 4000: 1.019509
Loss at step 5000: 0.883069
Loss at step 6000: 0.755527
Loss at step 7000: 0.754547
Loss at step 8000: 0.576015
Loss at step 9000: 0.650929
Loss at step 10000: 0.564770
Loss at step 11000: 0.619074
Loss at step 12000: 0.537853
Loss at step 13000: 0.449081
Loss at step 14000: 0.549803
Loss at step 15000: 0.406726
Loss at step 16000: 0.424104
Loss at step 17000: 0.418756
Loss at step 18000: 0.448778
Loss at step 19000: 0.379684
Loss at step 20000: 0.355999
Loss at step 21000: 0.354347
Loss at step 22000: 0.344518
Loss at step 23000: 0.356950
Loss at step 24000: 0.363686
Loss at step 25000: 0.499905
Loss at step 26000: 0.323146
Loss at step 27000: 0.351469
Loss at step 28000: 0.326195
Loss at step 29000: 0.321812
Loss at step 30000: 0.329887
Loss at step 31000: 0.288581
Loss at step 32000: 0.292715
Loss at step 33000: 0.325919
Loss at step 34000: 0.297657
Loss at step 35000: 0.2

HBox(children=(FloatProgress(value=0.0, max=59597.28), HTML(value='')))

Loss at step 1000: 0.238878
Loss at step 2000: 0.244191
Loss at step 3000: 0.226319
Loss at step 4000: 0.226505
Loss at step 5000: 0.240329
Loss at step 6000: 0.238354
Loss at step 7000: 0.216779
Loss at step 8000: 0.237228
Loss at step 9000: 0.228516
Loss at step 10000: 0.234125
Loss at step 11000: 0.253471
Loss at step 12000: 0.221960
Loss at step 13000: 0.227634
Loss at step 14000: 0.244345
Loss at step 15000: 0.219512
Loss at step 16000: 0.222610
Loss at step 17000: 0.205084
Loss at step 18000: 0.237933
Loss at step 19000: 0.226117
Loss at step 20000: 0.216706
Loss at step 21000: 0.227124
Loss at step 22000: 0.220710
Loss at step 23000: 0.225798
Loss at step 24000: 0.229004
Loss at step 25000: 0.205975
Loss at step 26000: 0.201250
Loss at step 27000: 0.217797
Loss at step 28000: 0.234789
Loss at step 29000: 0.236761
Loss at step 30000: 0.214761
Loss at step 31000: 0.222077
Loss at step 32000: 0.206324
Loss at step 33000: 0.250539
Loss at step 34000: 0.239995
Loss at step 35000: 0.2

HBox(children=(FloatProgress(value=0.0, max=59597.28), HTML(value='')))

Loss at step 1000: 0.204851
Loss at step 2000: 0.209224
Loss at step 3000: 0.195189
Loss at step 4000: 0.201477
Loss at step 5000: 0.217809
Loss at step 6000: 0.177617
Loss at step 7000: 0.214686
Loss at step 8000: 0.201794
Loss at step 9000: 0.186397
Loss at step 10000: 0.230329
Loss at step 11000: 0.195911
Loss at step 12000: 0.195249
Loss at step 13000: 0.207946
Loss at step 14000: 0.213625
Loss at step 15000: 0.212532
Loss at step 16000: 0.198463
Loss at step 17000: 0.225018
Loss at step 18000: 0.196940
Loss at step 19000: 0.205046
Loss at step 20000: 0.182587
Loss at step 21000: 0.219257
Loss at step 22000: 0.205601
Loss at step 23000: 0.215484
Loss at step 24000: 0.211120
Loss at step 25000: 0.214080
Loss at step 26000: 0.200246
Loss at step 27000: 0.199296
Loss at step 28000: 0.209348
Loss at step 29000: 0.207838
Loss at step 30000: 0.234481
Loss at step 31000: 0.204855
Loss at step 32000: 0.200574
Loss at step 33000: 0.231395
Loss at step 34000: 0.225097
Loss at step 35000: 0.1

In [None]:
EPOCHS = 5
losses = []
writer = SummaryWriter(logdir='scalar/word2vec')

for epoch in tqdm(range(EPOCHS), desc='Epoch'):
    total_loss, pol_loss = 0, 0

    for subreddits, user in tqdm(context_vecs, desc='Processing subreddits for user'):
        context_ids = make_context_vector(subreddits, word_to_ix).to(device)
        
        out_act = model(context_ids)
        
        # Generate 2 negative samples for every positive sample
        negative_samples = generate_negative_samples(user, len(subreddits) * 2)
        negative_ids = make_context_vector(negative_samples, word_to_ix).to(device)
        
        loss = loss_function(out_act, torch.tensor([word_to_ix[user]], dtype=torch.long))
        
        # Update loss function
        for sub_ix in context_ids:
            loss += 1 - torch.sigmoid(out_act[0, sub_ix]) 

        for sub_ix in negative_ids:
            loss += 0 - torch.sigmoid(out_act[0, sub_ix])
            
        # If we know their political affiliation pass it through another linear layer
        if user in user_to_politics:
            
            pred = pol_model(torch.tensor([word_to_ix[user]], dtype=torch.long))
            pol_loss = loss_function(pred, user_to_politics[user])
            # TODO: Review this
            loss += pol_loss
            pol_loss.backward()
            pol_optimizer.step()
                             
        loss.backward()
        optimizer.step()
#         writer.add_scalar('word2vec loss', loss.detach().numpy(), epoch)     
    print(total_loss)
    losses.append(total_loss)
    
writer.close()
losses

In [None]:
 torch.cuda.is_available()

# Embeddings Results

In [None]:
def top_n_similar(subreddit, n):
    cosine_sims = {}
    cos = nn.CosineSimilarity(dim=1, eps=1e-6)
    sub_tensor = lookup_tensor = torch.tensor([word_to_ix[subreddit]], dtype=torch.long)

    for sub, _ in top_subs.items():
        lookup_tensor = torch.tensor([word_to_ix[sub]], dtype=torch.long)
        result = cos(model.embeddings(sub_tensor), model.embeddings(lookup_tensor))
        cosine_sims[sub] = result
        
    cosine_sims = {k: v for k, v in sorted(cosine_sims.items(), key=lambda item: item[1], reverse=True)}
    return dict(itertools.islice(cosine_sims.items(), n))
    
top_n_similar('r/CryptoCurrency', n=10)

# Save embeddings to TSV

# Predict Political Affiliation 