In [82]:
import torch   
from torch import nn
import torch.optim as optim
from torchtext import data  
import pandas as pd
from torchtext.legacy.data import Field, LabelField, TabularDataset, BucketIterator
from collections import defaultdict, Counter
from tqdm.notebook import tqdm

SEED = 2021
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f83a40f2a90>

In [7]:
basedir='/shared/0/projects/reddit-political-affiliation/data/username-labels/'

train_data_source = basedir + 'user2label.train.csv'
test_data_source = basedir + 'user2label.dev.csv'

train_df = pd.read_csv(train_data_source)
dev_df = pd.read_csv(test_data_source)

train_df.head()

Unnamed: 0,username,label,binary_label
0,Tuco_bell,Republican,0
1,twentyeyedfiend,Republican,0
2,joadbrotherfollower,Democrat,1
3,420philcollins666,Democrat,1
4,mheran,Democrat,1


In [83]:
len(train_df), Counter(train_df['binary_label'])

(157419, Counter({0: 58915, 1: 98504}))

In [90]:
TEXT = Field(tokenize=list, batch_first=True, include_lengths=True)
LABEL = LabelField(dtype = torch.float, batch_first=True)

In [94]:
fields = [('text',TEXT), ('label', LABEL), (None, None), ]

#loading custom dataset
training_data = TabularDataset(basedir + 'user2label.train.csv',format = 'csv',
                                  fields = fields, skip_header = True)

dev_data = TabularDataset(basedir + 'user2label.dev.csv',format = 'csv',
                                  fields = fields, skip_header = True)

test_data = TabularDataset(basedir + 'user2label.test.csv',format = 'csv',
                                  fields = fields, skip_header = True)

#print preprocessed text
print(vars(training_data.examples[0]))

{'text': ['T', 'u', 'c', 'o', '_', 'b', 'e', 'l', 'l'], 'label': 'Republican'}


In [95]:
print(vars(training_data.examples[2]))

{'text': ['j', 'o', 'a', 'd', 'b', 'r', 'o', 't', 'h', 'e', 'r', 'f', 'o', 'l', 'l', 'o', 'w', 'e', 'r'], 'label': 'Democrat'}


In [96]:
#initialize glove embeddings
TEXT.build_vocab(training_data, min_freq=1)  
LABEL.build_vocab(training_data)

#No. of unique tokens in text
print("Size of TEXT vocabulary:",len(TEXT.vocab))

#No. of unique tokens in label
print("Size of LABEL vocabulary:",len(LABEL.vocab))

#Commonly used words
print(TEXT.vocab.freqs.most_common(10))  

#Word dictionary
print(TEXT.vocab.stoi)   

Size of TEXT vocabulary: 66
Size of LABEL vocabulary: 2
[('e', 160655), ('a', 138139), ('o', 112436), ('r', 107462), ('i', 103414), ('n', 99179), ('t', 91631), ('s', 83651), ('l', 73721), ('h', 52384)]
defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x7f83a560c910>>, {'<unk>': 0, '<pad>': 1, 'e': 2, 'a': 3, 'o': 4, 'r': 5, 'i': 6, 'n': 7, 't': 8, 's': 9, 'l': 10, 'h': 11, 'd': 12, 'u': 13, 'c': 14, 'm': 15, 'y': 16, 'g': 17, '_': 18, 'p': 19, 'k': 20, 'b': 21, 'f': 22, 'w': 23, '1': 24, 'S': 25, 'T': 26, 'M': 27, '2': 28, 'v': 29, '0': 30, 'A': 31, 'B': 32, 'C': 33, 'D': 34, 'R': 35, 'z': 36, 'P': 37, '3': 38, '9': 39, '4': 40, 'L': 41, 'x': 42, 'j': 43, 'G': 44, '7': 45, 'I': 46, 'F': 47, '8': 48, 'N': 49, '-': 50, 'E': 51, 'O': 52, 'H': 53, '6': 54, '5': 55, 'W': 56, 'J': 57, 'K': 58, 'U': 59, 'V': 60, 'Y': 61, 'q': 62, 'Z': 63, 'X': 64, 'Q': 65})


In [97]:
#check whether cuda is available
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')  

#set batch size
BATCH_SIZE = 128

#Load an iterator
train_iterator, valid_iterator = BucketIterator.splits(
    (training_data, dev_data), 
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.text),
    sort_within_batch=True,
    device = device)

In [98]:
class UsernameClassifier(nn.Module):
    
    #define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout):
        
        #Constructor
        super().__init__()          
        
        #embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        #lstm layer
        self.lstm = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout,
                           batch_first=True)
        
        #dense layer
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        
        #activation function
        self.act = nn.Sigmoid()
        
    def forward(self, text, text_lengths):
        
        #text = [batch size,sent_length]
        embedded = self.embedding(text)
        #embedded = [batch size, sent_len, emb dim]
      
        #packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.cpu(), batch_first=True)
        
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        #hidden = [batch size, num layers * num directions,hid dim]
        #cell = [batch size, num layers * num directions,hid dim]
        
        #concat the final forward and backward hidden state
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
                
        #hidden = [batch size, hid dim * num directions]
        dense_outputs=self.fc(hidden)

        #Final activation function
        outputs=self.act(dense_outputs)
        
        return outputs

In [124]:
size_of_vocab = len(TEXT.vocab)
embedding_dim = 15
num_hidden_nodes = 256
num_output_nodes = 1
num_layers = 2
bidirection = True
dropout = 0.2

#instantiate the model
model = UsernameClassifier(size_of_vocab, embedding_dim, num_hidden_nodes,num_output_nodes, num_layers, 
                   bidirectional = True, dropout = dropout)

In [125]:
#define optimizer and loss
optimizer = optim.Adam(model.parameters())
criterion = nn.BCELoss()

#define metric
def binary_accuracy(preds, y):
    #round predictions to the closest integer
    rounded_preds = torch.round(preds)
    
    correct = (rounded_preds == y).float() 
    acc = correct.sum() / len(correct)
    return acc
    
#push to cuda if available
model = model.to(device)
criterion = criterion.to(device)

In [126]:
def train(model, iterator, optimizer, criterion):
    
    #initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    
    #set the model in training phase
    model.train()  
    
    for batch in tqdm(iterator):
        
        #resets the gradients after every batch
        optimizer.zero_grad()   
        
        #retrieve text and no. of words
        text, text_lengths = batch.text   
        
        #convert to 1D tensor
        predictions = model(text, text_lengths).squeeze()  
        
        #compute the loss
        loss = criterion(predictions, batch.label)        
        
        #compute the binary accuracy
        acc = binary_accuracy(predictions, batch.label)   
        
        #backpropage the loss and compute the gradients
        loss.backward()       
        
        #update the weights
        optimizer.step()      
        
        #loss and accuracy
        epoch_loss += loss.item()  
        epoch_acc += acc.item()    
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [127]:
def evaluate(model, iterator, criterion):
    
    #initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    #deactivating dropout layers
    model.eval()
    
    #deactivates autograd
    with torch.no_grad():
    
        for batch in iterator:       
        
            #retrieve text and no. of words
            text, text_lengths = batch.text
            
            #convert to 1d tensor
            predictions = model(text, text_lengths).squeeze()
            
            #compute loss and accuracy
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            
            #keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [132]:
N_EPOCHS = 10
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    #train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    #evaluate the model
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    #save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1230.0), HTML(value='')))


	Train Loss: 0.661 | Train Acc: 62.59%
	 Val. Loss: 0.664 |  Val. Acc: 62.08%


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1230.0), HTML(value='')))


	Train Loss: 0.660 | Train Acc: 62.61%
	 Val. Loss: 0.664 |  Val. Acc: 62.09%


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1230.0), HTML(value='')))


	Train Loss: 0.660 | Train Acc: 62.62%
	 Val. Loss: 0.664 |  Val. Acc: 62.07%


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1230.0), HTML(value='')))


	Train Loss: 0.660 | Train Acc: 62.59%
	 Val. Loss: 0.663 |  Val. Acc: 62.09%


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1230.0), HTML(value='')))


	Train Loss: 0.660 | Train Acc: 62.63%
	 Val. Loss: 0.664 |  Val. Acc: 62.08%


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1230.0), HTML(value='')))


	Train Loss: 0.660 | Train Acc: 62.66%
	 Val. Loss: 0.663 |  Val. Acc: 62.12%


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1230.0), HTML(value='')))


	Train Loss: 0.659 | Train Acc: 62.67%
	 Val. Loss: 0.664 |  Val. Acc: 62.09%


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1230.0), HTML(value='')))


	Train Loss: 0.657 | Train Acc: 62.74%
	 Val. Loss: 0.665 |  Val. Acc: 62.04%


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1230.0), HTML(value='')))


	Train Loss: 0.655 | Train Acc: 62.85%
	 Val. Loss: 0.665 |  Val. Acc: 61.94%


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1230.0), HTML(value='')))


	Train Loss: 0.651 | Train Acc: 63.08%
	 Val. Loss: 0.670 |  Val. Acc: 61.58%


In [129]:
# Check out some of the predictions
model.eval()

from collections import defaultdict
dev_pred_df = defaultdict(list)

#deactivates autograd
with torch.no_grad():

    for batch in tqdm(valid_iterator):

        #retrieve text and no. of words
        text, text_lengths = batch.text

        #convert to 1d tensor
        predictions = model(text, text_lengths).squeeze()

        for i, cids in enumerate(text.cpu()):
            username = ''.join([TEXT.vocab.itos[cid] for cid in cids])
            pred = predictions[i].cpu().item()
            dev_pred_df['username'].append(username)
            dev_pred_df['prediction'].append(pred)
        
        #print(text)
        #break
dev_pred_df = pd.DataFrame(dev_pred_df)
dev_pred_df.head()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=154.0), HTML(value='')))




Unnamed: 0,username,prediction
0,FM4k,0.41906
1,curf,0.403236
2,K503,0.410251
3,ep0k,0.402365
4,woze,0.391493


In [130]:
dev_pred_df = dev_pred_df.sort_values(by = 'prediction')
dev_pred_df.head(50)

Unnamed: 0,username,prediction
1718,bllasae,0.356522
15073,VyseLegendaire,0.3591
15991,blueberry_crepe,0.360223
4630,burmieee,0.360729
824,VOBone,0.360738
3660,blakepoe,0.360809
7255,blad_saras,0.36112
6949,VortexLine,0.361201
13105,bluesiswhoiam,0.361244
6564,bawlsaque,0.361702


In [131]:
dev_pred_df.tail(50)

Unnamed: 0,username,prediction
1510,AG9090,0.439232
16431,420canadiangreen,0.439361
5587,197328645,0.439385
5324,N0N-R0B0T,0.439474
95,zn95,0.439485
108,957<pad>,0.440031
7442,7448342501,0.440097
112,559<pad>,0.44017
142,51674,0.440209
13690,404NotFounded,0.440222
